16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ******************************************************************************* 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copyright (C) 2006-2013, International Business Machines Corporation 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and others. All Rights Reserved. 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ******************************************************************************* 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_BREAK_ITERATION 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "brkeng.h" 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "dictbe.h" 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h" 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/chariter.h" 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ubrk.h" 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uvector.h" 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "uassert.h" 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/normlzr.h" 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "cmemory.h" 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "dictionarydata.h" 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN 
246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ****************************************************************** 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgDictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fTypes = breakTypes; 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgDictionaryBreakEngine::~DictionaryBreakEngine() { 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgUBool 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgDictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org && fSet.contains(c)); 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgDictionaryBreakEngine::findBreaks( UText *text, 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t startPos, 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t endPos, 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool reverse, 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t breakType, 
486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStack &foundBreaks ) const { 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t result = 0; 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Find the span of characters included in the set. 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t start = (int32_t)utext_getNativeIndex(text); 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t current; 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeStart; 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeEnd; 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 c = utext_current32(text); 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (reverse) { 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool isDict = fSet.contains(c); 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org c = utext_previous32(text); 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org isDict = fSet.contains(c); 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rangeStart = (current < startPos) ? startPos : current+(isDict ? 
0 : 1); 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rangeEnd = start + 1; 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); // TODO: recast loop for postincrement 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org c = utext_current32(text); 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rangeStart = start; 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org rangeEnd = current; 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, current); 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return result; 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgDictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fSet = set; 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Compact for caching 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fSet.compact(); 
876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ****************************************************************** 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * PossibleWord 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Helper class for improving readability of the Thai/Lao/Khmer word break 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// algorithm. The implementation is completely inline. 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// List size, limited by the maximum number of words in the dictionary 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// that form a nested sequence. 
996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define POSSIBLE_WORD_LIST_MAX 20 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass PossibleWord { 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprivate: 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // list of word candidate lengths, in increasing length order 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t lengths[POSSIBLE_WORD_LIST_MAX]; 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count; // Count of candidates 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t prefix; // The longest match with a dictionary word 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t offset; // Offset in the text of these candidates 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int mark; // The preferred candidate's offset 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int current; // The candidate we're currently looking at 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org PossibleWord(); 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ~PossibleWord(); 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Fill the list of candidates if needed, select the longest, and return the number found 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Select the currently marked candidate, point after it in the text, and invalidate self 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
int32_t acceptMarked( UText *text ); 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Back up from the current candidate to the next shorter one; return TRUE if that exists 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and point the text after it 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool backUp( UText *text ); 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Return the longest prefix this candidate location shares with a dictionary word 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t longestPrefix(); 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Mark the current candidate as the one we like 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org void markCurrent(); 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orginline 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgPossibleWord::PossibleWord() { 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offset = -1; 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orginline 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgPossibleWord::~PossibleWord() { 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orginline int 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgPossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { 
1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t start = (int32_t)utext_getNativeIndex(text); 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (start != offset) { 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org offset = start; 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Dictionary leaves text after longest prefix, not longest word. Back up. 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (count <= 0) { 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, start); 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (count > 0) { 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, start+lengths[count-1]); 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org current = count-1; 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org mark = current; 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return count; 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orginline int32_t 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgPossibleWord::acceptMarked( UText *text ) { 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, offset + lengths[mark]); 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return lengths[mark]; 
1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orginline UBool 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgPossibleWord::backUp( UText *text ) { 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (current > 0) { 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, offset + lengths[--current]); 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return TRUE; 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return FALSE; 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orginline int32_t 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgPossibleWord::longestPrefix() { 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return prefix; 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orginline void 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgPossibleWord::markCurrent() { 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org mark = current; 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ****************************************************************** 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * ThaiBreakEngine 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
// How many words in a row are "good enough"? Also the number of
// PossibleWord slots the Thai engine cycles through while looking ahead.
#define THAI_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define THAI_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define THAI_PREFIX_COMBINE_THRESHOLD 3

// Elision character
#define THAI_PAIYANNOI 0x0E2F

// Repeat character
#define THAI_MAIYAMOK 0x0E46

// Minimum word size
#define THAI_MIN_WORD 2

// Minimum number of characters for two words
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)

2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fDictionary(adoptDictionary) 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_SUCCESS(status)) { 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org setCharacters(fThaiWordSet); 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.add(0x0020); 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet = fThaiWordSet; 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fSuffixSet.add(THAI_PAIYANNOI); 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fSuffixSet.add(THAI_MAIYAMOK); 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Compact for caching. 
2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.compact(); 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet.compact(); 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.compact(); 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fSuffixSet.compact(); 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgThaiBreakEngine::~ThaiBreakEngine() { 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete fDictionary; 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgThaiBreakEngine::divideUpDictionaryRange( UText *text, 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeStart, 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeEnd, 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStack &foundBreaks ) const { 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Not enough characters for two words 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t wordsFound = 0; 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t wordLength; 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t current; 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org PossibleWord words[THAI_LOOKAHEAD]; 
2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 uc; 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, rangeStart); 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = 0; 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look for candidate words at the current position 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we found exactly one, use that 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (candidates == 1) { 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If there was more than one, see which one can take us forward the most words 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else if (candidates > 1) { 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we're already at the end of the range, we're done 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 
2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int wordsMatched = 1; 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordsMatched < 2) { 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Followed by another dictionary word; mark first word as a good candidate 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsMatched = 2; 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we're already at the end of the range, we're done 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // See if any of the possible second words is followed by a third word 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we find a third word, stop right away 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org words[wordsFound % THAI_LOOKAHEAD].markCurrent(); 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 
2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text)); 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (words[wordsFound % THAI_LOOKAHEAD].backUp(text)); 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgfoundBest: 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We come here after having either found a word or not. We look ahead to the 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // next word. If it's not a dictionary word, we will combine it withe the word we 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // just found (if there is one), but only if the preceding word does not exceed 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the threshold. 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The text iterator should now be positioned at the end of the word we found. 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if it is a dictionary word, do nothing. 
If it isn't, then if there is 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // no preceding word, or the non-word shares less than the minimum threshold 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // of characters with a dictionary word, then scan to resynchronize 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org && (wordLength == 0 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look for a plausible word boundary 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //TODO: This section will need a rework for UText. 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t remaining = rangeEnd - (current+wordLength); 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 pc = utext_current32(text); 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t chars = 0; 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (;;) { 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uc = utext_current32(text); 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Here we're counting on the fact that the SA languages are all 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // in the BMP. This should get fixed with the UText rework. 
3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org chars += 1; 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (--remaining <= 0) { 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Maybe. See if it's in the dictionary. 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // NOTE: In the original Apple code, checked that the next 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // two characters after uc were not 0x0E4C THANTHAKHAT before 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // checking the dictionary. That is just a performance filter, 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // but it's not clear it's faster than checking the trie. 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, current + wordLength + chars); 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (candidates > 0) { 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pc = uc; 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Bump the word count if there wasn't already one 3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordLength <= 0) { 
3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Update the length with the passed-over characters 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength += chars; 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Back up to where we were for next iteration 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, current+wordLength); 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Never stop before a combining mark. 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t currPos; 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look ahead for possible suffixes if a dictionary word does not follow. 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We do this in code rather than using a rule so that the heuristic 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // resynch continues to function. 
For example, one of the suffix characters 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // could be a typo in the middle of a word. 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org && fSuffixSet.contains(uc = utext_current32(text))) { 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (uc == THAI_PAIYANNOI) { 3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (!fSuffixSet.contains(utext_previous32(text))) { 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip over previous end and PAIYANNOI 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength += 1; // Add PAIYANNOI to word 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uc = utext_current32(text); // Fetch next character 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Restore prior position 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (uc == THAI_MAIYAMOK) { 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (utext_previous32(text) != THAI_MAIYAMOK) { 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip over previous end and MAIYAMOK 
3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength += 1; // Add MAIYAMOK to word 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Restore prior position 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, current+wordLength); 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Did we find a word on this iteration? If so, push it on the break stack 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordLength > 0) { 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org foundBreaks.push((current+wordLength), status); 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Don't return a break for the end of the dictionary range if there is one there. 
4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (foundBreaks.peeki() >= rangeEnd) { 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (void) foundBreaks.popi(); 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound -= 1; 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return wordsFound; 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ****************************************************************** 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * LaoBreakEngine 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// How many words in a row are "good enough"? 
4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define LAO_LOOKAHEAD 3 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Will not combine a non-word with a preceding dictionary word longer than this 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define LAO_ROOT_COMBINE_THRESHOLD 3 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Will not combine a non-word that shares at least this much prefix with a 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// dictionary word, with a preceding word 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define LAO_PREFIX_COMBINE_THRESHOLD 3 4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Minimum word size 4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define LAO_MIN_WORD 2 4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Minimum number of characters for two words 4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2) 4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgLaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fDictionary(adoptDictionary) 4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); 4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_SUCCESS(status)) { 
4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org setCharacters(fLaoWordSet); 4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); 4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.add(0x0020); 4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet = fLaoWordSet; 4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels 4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) 4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) 4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels 4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Compact for caching. 
4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.compact(); 4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet.compact(); 4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.compact(); 4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgLaoBreakEngine::~LaoBreakEngine() { 4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete fDictionary; 4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t 4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgLaoBreakEngine::divideUpDictionaryRange( UText *text, 4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeStart, 4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeEnd, 4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStack &foundBreaks ) const { 4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { 4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Not enough characters for two words 4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t wordsFound = 0; 4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t wordLength; 4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t current; 4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org PossibleWord words[LAO_LOOKAHEAD]; 4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 uc; 
4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, rangeStart); 4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = 0; 4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look for candidate words at the current position 4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we found exactly one, use that 4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (candidates == 1) { 4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If there was more than one, see which one can take us forward the most words 5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else if (candidates > 1) { 5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we're already at the end of the range, we're done 5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 
5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int wordsMatched = 1; 5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordsMatched < 2) { 5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Followed by another dictionary word; mark first word as a good candidate 5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org words[wordsFound%LAO_LOOKAHEAD].markCurrent(); 5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsMatched = 2; 5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we're already at the end of the range, we're done 5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // See if any of the possible second words is followed by a third word 5206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 5216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we find a third word, stop right away 5226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 5236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org words[wordsFound % LAO_LOOKAHEAD].markCurrent(); 5246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 5256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 
5276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)); 5286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (words[wordsFound % LAO_LOOKAHEAD].backUp(text)); 5316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgfoundBest: 5326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 5336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 5346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We come here after having either found a word or not. We look ahead to the 5376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // next word. If it's not a dictionary word, we will combine it withe the word we 5386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // just found (if there is one), but only if the preceding word does not exceed 5396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the threshold. 5406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The text iterator should now be positioned at the end of the word we found. 5416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) { 5426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if it is a dictionary word, do nothing. 
If it isn't, then if there is 5436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // no preceding word, or the non-word shares less than the minimum threshold 5446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // of characters with a dictionary word, then scan to resynchronize 5456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 5466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org && (wordLength == 0 5476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) { 5486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look for a plausible word boundary 5496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //TODO: This section will need a rework for UText. 5506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t remaining = rangeEnd - (current+wordLength); 5516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 pc = utext_current32(text); 5526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t chars = 0; 5536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (;;) { 5546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 5556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uc = utext_current32(text); 5566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Here we're counting on the fact that the SA languages are all 5576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // in the BMP. This should get fixed with the UText rework. 
5586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org chars += 1; 5596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (--remaining <= 0) { 5606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 5616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 5636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Maybe. See if it's in the dictionary. 5646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 5656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, current + wordLength + chars); 5666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (candidates > 0) { 5676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 5686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pc = uc; 5716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Bump the word count if there wasn't already one 5746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordLength <= 0) { 5756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 5766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Update the length with the passed-over characters 5796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength += chars; 5806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 
5826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Back up to where we were for next iteration 5836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, current+wordLength); 5846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Never stop before a combining mark. 5886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t currPos; 5896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 5906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 5916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 5926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 5936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look ahead for possible suffixes if a dictionary word does not follow. 5956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We do this in code rather than using a rule so that the heuristic 5966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // resynch continues to function. For example, one of the suffix characters 5976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // could be a typo in the middle of a word. 5986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // NOT CURRENTLY APPLICABLE TO LAO 5996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Did we find a word on this iteration? 
If so, push it on the break stack 6016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordLength > 0) { 6026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org foundBreaks.push((current+wordLength), status); 6036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Don't return a break for the end of the dictionary range if there is one there. 6076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (foundBreaks.peeki() >= rangeEnd) { 6086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (void) foundBreaks.popi(); 6096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound -= 1; 6106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return wordsFound; 6136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 6146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 6166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ****************************************************************** 6176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * KhmerBreakEngine 6186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 6196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// How many words in a row are "good enough"? 
6216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define KHMER_LOOKAHEAD 3 6226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Will not combine a non-word with a preceding dictionary word longer than this 6244dfa619cf375ebb67b7b9311487d19a4129f742fjshin@chromium.org#define KHMER_ROOT_COMBINE_THRESHOLD 10 6256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Will not combine a non-word that shares at least this much prefix with a 6276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// dictionary word, with a preceding word 6284dfa619cf375ebb67b7b9311487d19a4129f742fjshin@chromium.org#define KHMER_PREFIX_COMBINE_THRESHOLD 5 6296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Minimum word size 6316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define KHMER_MIN_WORD 2 6326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Minimum number of characters for two words 6346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2) 6356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgKhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 6376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), 6386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fDictionary(adoptDictionary) 6396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org{ 6406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); 6416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if 
(U_SUCCESS(status)) { 6426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org setCharacters(fKhmerWordSet); 6436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); 6456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.add(0x0020); 6466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet = fKhmerWordSet; 6476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.add(0x1780, 0x17B3); 6486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels 6496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word 6506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word 6516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 6526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels 6536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 6546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 6556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 6566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 6576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// fSuffixSet.add(THAI_PAIYANNOI); 6586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// fSuffixSet.add(THAI_MAIYAMOK); 
6596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Compact for caching. 6616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fMarkSet.compact(); 6626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fEndWordSet.compact(); 6636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fBeginWordSet.compact(); 6646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// fSuffixSet.compact(); 6656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 6666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgKhmerBreakEngine::~KhmerBreakEngine() { 6686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete fDictionary; 6696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 6706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t 6726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgKhmerBreakEngine::divideUpDictionaryRange( UText *text, 6736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeStart, 6746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeEnd, 6756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStack &foundBreaks ) const { 6766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 6776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; // Not enough characters for two words 6786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 6796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t wordsFound = 0; 6816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t wordLength; 6826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t current; 6836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
UErrorCode status = U_ZERO_ERROR; 6846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org PossibleWord words[KHMER_LOOKAHEAD]; 6856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 uc; 6866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, rangeStart); 6886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 6906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = 0; 6916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look for candidate words at the current position 6936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 6946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 6956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we found exactly one, use that 6966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (candidates == 1) { 6976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 6986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 6996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If there was more than one, see which one can take us forward the most words 7026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else if (candidates > 1) { 7036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we're already at the end of the range, we're done 7046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if 
((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 7056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 7066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 7086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int wordsMatched = 1; 7096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 7106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordsMatched < 2) { 7116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Followed by another dictionary word; mark first word as a good candidate 7126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); 7136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsMatched = 2; 7146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we're already at the end of the range, we're done 7176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 7186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 7196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // See if any of the possible second words is followed by a third word 7226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org do { 7236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If we find a third word, stop right away 7246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 7256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org words[wordsFound % 
KHMER_LOOKAHEAD].markCurrent(); 7266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org goto foundBest; 7276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); 7306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); 7336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgfoundBest: 7346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); 7356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 7366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We come here after having either found a word or not. We look ahead to the 7396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // next word. If it's not a dictionary word, we will combine it with the word we 7406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // just found (if there is one), but only if the preceding word does not exceed 7416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the threshold. 7426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The text iterator should now be positioned at the end of the word we found. 7436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) { 7446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if it is a dictionary word, do nothing. 
If it isn't, then if there is 7456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // no preceding word, or the non-word shares less than the minimum threshold 7466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // of characters with a dictionary word, then scan to resynchronize 7476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 7486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org && (wordLength == 0 7496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { 7506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look for a plausible word boundary 7516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //TODO: This section will need a rework for UText. 7526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t remaining = rangeEnd - (current+wordLength); 7536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UChar32 pc = utext_current32(text); 7546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t chars = 0; 7556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (;;) { 7566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 7576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uc = utext_current32(text); 7586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Here we're counting on the fact that the SA languages are all 7596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // in the BMP. This should get fixed with the UText rework. 
7606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org chars += 1; 7616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (--remaining <= 0) { 7626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 7636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 7656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Maybe. See if it's in the dictionary. 7666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 7676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, current+wordLength+chars); 7686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (candidates > 0) { 7696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org break; 7706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pc = uc; 7736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Bump the word count if there wasn't already one 7766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordLength <= 0) { 7776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound += 1; 7786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Update the length with the passed-over characters 7816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength += chars; 7826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 
7846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Back up to where we were for next iteration 7856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(text, current+wordLength); 7866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Never stop before a combining mark. 7906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t currPos; 7916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 7926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(text); 7936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 7946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 7956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 7966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Look ahead for possible suffixes if a dictionary word does not follow. 7976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We do this in code rather than using a rule so that the heuristic 7986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // resynch continues to function. For example, one of the suffix characters 7996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // could be a typo in the middle of a word. 
8006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 8016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 8026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// && fSuffixSet.contains(uc = utext_current32(text))) { 8036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// if (uc == KHMER_PAIYANNOI) { 8046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// if (!fSuffixSet.contains(utext_previous32(text))) { 8056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// // Skip over previous end and PAIYANNOI 8066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// utext_next32(text); 8076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// utext_next32(text); 8086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// wordLength += 1; // Add PAIYANNOI to word 8096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// uc = utext_current32(text); // Fetch next character 8106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// else { 8126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// // Restore prior position 8136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// utext_next32(text); 8146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// if (uc == KHMER_MAIYAMOK) { 8176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// if (utext_previous32(text) != KHMER_MAIYAMOK) { 8186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// // Skip over previous end and MAIYAMOK 8196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// utext_next32(text); 8206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// 
utext_next32(text); 8216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// wordLength += 1; // Add MAIYAMOK to word 8226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// else { 8246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// // Restore prior position 8256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// utext_next32(text); 8266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// else { 8306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// utext_setNativeIndex(text, current+wordLength); 8316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// } 8336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Did we find a word on this iteration? If so, push it on the break stack 8356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (wordLength > 0) { 8366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org foundBreaks.push((current+wordLength), status); 8376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Don't return a break for the end of the dictionary range if there is one there. 
8416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (foundBreaks.peeki() >= rangeEnd) { 8426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org (void) foundBreaks.popi(); 8436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org wordsFound -= 1; 8446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return wordsFound; 8476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 8486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if !UCONFIG_NO_NORMALIZATION 8506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 8516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ****************************************************************** 8526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * CjkBreakEngine 8536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 8546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstatic const uint32_t kuint32max = 0xFFFFFFFF; 8556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status) 8566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) { 8576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Korean dictionary only includes Hangul syllables 8586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); 8596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); 8606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); 
8616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); 8626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_SUCCESS(status)) { 8646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // handle Korean and Japanese/Chinese using different dictionaries 8656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (type == kKorean) { 8666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org setCharacters(fHangulWordSet); 8676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { //Chinese and Japanese 8686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeSet cjSet; 8696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cjSet.addAll(fHanWordSet); 8706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cjSet.addAll(fKatakanaWordSet); 8716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cjSet.addAll(fHiraganaWordSet); 8726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 8736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 8746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org setCharacters(cjSet); 8756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 8776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 8786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCjkBreakEngine::~CjkBreakEngine(){ 8806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org delete fDictionary; 8816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 8826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 8836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// The 
// katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary

// Longest katakana run that has its own entry in the cost table.
static const int kMaxKatakanaLength = 8;
// Upper bound on the length of a katakana run considered as one candidate word.
static const int kMaxKatakanaGroupLength = 20;
// Cost given to a single character that has no one-character dictionary match.
static const uint32_t maxSnlp = 255;

/*
 * Returns the default cost of treating a run of katakana as a single word.
 *
 * @param wordLength Length of the run, in characters.
 * @return The table entry for lengths 0..kMaxKatakanaLength, and 8192 for any
 *         other length. Negative lengths are included in "any other": they
 *         previously indexed the table out of bounds.
 */
static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    if (wordLength < 0 || wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}

/*
 * Tells whether a code point is a katakana character for the purposes of the
 * katakana-run heuristic.
 *
 * The parameter is a full 32-bit code point (same width as UChar32), not a
 * 16-bit code unit: callers pass the result of utext_current32(), and a 16-bit
 * parameter would truncate supplementary code points (for example U+230A1
 * would be seen as U+30A1) and misclassify them as katakana.
 *
 * @param value Code point to test; negative values are never katakana.
 * @return true for U+30A1..U+30FE except U+30FB (KATAKANA MIDDLE DOT), and
 *         for the halfwidth range U+FF66..U+FF9F.
 */
static inline bool isKatakana(int32_t value) {
    const bool inKatakanaBlock =
            value >= 0x30A1 && value <= 0x30FE && value != 0x30FB;
    const bool inHalfwidthKatakana =
            value >= 0xFF66 && value <= 0xFF9F;
    return inKatakanaBlock || inHalfwidthKatakana;
}

// A very simple helper class to streamline the buffer handling in
// divideUpDictionaryRange.
9036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgtemplate<class T, size_t N> 9046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass AutoBuffer { 9056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 9066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { 9076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (size > N) { 9086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); 9096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org capacity = size; 9106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ~AutoBuffer() { 9136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (buffer != stackBuffer) 9146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(buffer); 9156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org T* elems() { 9186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return buffer; 9196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const T& operator[] (size_t i) const { 9226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return buffer[i]; 9236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org T& operator[] (size_t i) { 9266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return buffer[i]; 9276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
9296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // resize without copy 9306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org void resize(size_t size) { 9316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (size <= capacity) 9326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return; 9336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (buffer != stackBuffer) 9346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uprv_free(buffer); 9356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); 9366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org capacity = size; 9376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprivate: 9406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org T stackBuffer[N]; 9416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org T* buffer; 9426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer(); 9436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org size_t capacity; 9446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 9456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 9486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param text A UText representing the text 9496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param rangeStart The start of the range of dictionary characters 9506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param rangeEnd The end of the range of dictionary characters 9516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param foundBreaks Output of C array of int32_t break positions, or 0 9526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return 
The number of breaks found 9536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 9546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgint32_t 9556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgCjkBreakEngine::divideUpDictionaryRange( UText *text, 9566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeStart, 9576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t rangeEnd, 9586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStack &foundBreaks ) const { 9596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (rangeStart >= rangeEnd) { 9606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 9616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const size_t defaultInputLength = 80; 9646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org size_t inputLength = rangeEnd - rangeStart; 9656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Replace by UnicodeString. 9666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer<UChar, defaultInputLength> charString(inputLength); 9676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Normalize the input string and put it in normalizedText. 9696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The map from the indices of the normalized input to the raw 9706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // input is kept in charPositions. 
9716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UErrorCode status = U_ZERO_ERROR; 9726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status); 9736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 9746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 9756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString inputString(charString.elems(), inputLength); 9786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Use Normalizer2. 9796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UNormalizationMode norm_mode = UNORM_NFKC; 9806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool isNormalized = 9816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES || 9826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Normalizer::isNormalized(inputString, norm_mode, status); 9836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 9846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Replace by UVector32. 9856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1); 9866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int numChars = 0; 9876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UText normalizedText = UTEXT_INITIALIZER; 9886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Needs to be declared here because normalizedText holds onto its buffer. 
9896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UnicodeString normalizedString; 9906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (isNormalized) { 9916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t index = 0; 9926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org charPositions[0] = 0; 9936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(index < inputString.length()) { 9946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org index = inputString.moveIndex32(index, 1); 9956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org charPositions[++numChars] = index; 9966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_openUnicodeString(&normalizedText, &inputString, &status); 9986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 9996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org else { 10006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status); 10016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 10026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 10036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org charPositions.resize(normalizedString.length() + 1); 10056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Normalizer normalizer(charString.elems(), inputLength, norm_mode); 10066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t index = 0; 10076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org charPositions[0] = 0; 10086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while(index < normalizer.endIndex()){ 10096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /* UChar32 uc = */ normalizer.next(); 10106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
charPositions[++numChars] = index = normalizer.getIndex(); 10116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_openUnicodeString(&normalizedText, &normalizedString, &status); 10136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (U_FAILURE(status)) { 10166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return 0; 10176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // From this point on, all the indices refer to the indices of 10206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the normalized input string. 10216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // bestSnlp[i] is the snlp of the best segmentation of the first i 10236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // characters in the range to be matched. 10246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Replace by UVector32. 
10256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1); 10266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bestSnlp[0] = 0; 10276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int i = 1; i <= numChars; i++) { 10286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bestSnlp[i] = kuint32max; 10296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // prev[i] is the index of the last CJK character in the previous word in 10326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the best segmentation of the first i characters. 10336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Replace by UVector32. 10346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer<int, defaultInputLength> prev(numChars + 1); 10356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for(int i = 0; i <= numChars; i++){ 10366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org prev[i] = -1; 10376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const size_t maxWordSize = 20; 10406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Replace both with UVector32. 10416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer<int32_t, maxWordSize> values(numChars); 10426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer<int32_t, maxWordSize> lengths(numChars); 10436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Dynamic programming to find the best segmentation. 
10456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bool is_prev_katakana = false; 10466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int32_t i = 0; i < numChars; ++i) { 10476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //utext_setNativeIndex(text, rangeStart + i); 10486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(&normalizedText, i); 10496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (bestSnlp[i] == kuint32max) 10506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org continue; 10516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t count; 10536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // limit maximum word length matched to size of current substring 10546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i); 10556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems()); 10576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // if there are no single character matches found in the dictionary 10596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // starting with this charcter, treat character as a 1-character word 10606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // with the highest value possible, i.e. the least likely to occur. 10616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Exclude Korean characters from this treatment, as they should be left 10626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // together by default. 
10636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if((count == 0 || lengths[0] != 1) && 10646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org !fHangulWordSet.contains(utext_current32(&normalizedText))) { 10656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org values[count] = maxSnlp; 10666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org lengths[count++] = 1; 10676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int j = 0; j < count; j++) { 10706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t newSnlp = bestSnlp[i] + values[j]; 10716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (newSnlp < bestSnlp[lengths[j] + i]) { 10726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bestSnlp[lengths[j] + i] = newSnlp; 10736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org prev[lengths[j] + i] = i; 10746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 10776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // In Japanese, 10786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Katakana word in single character is pretty rare. So we apply 10796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the following heuristic to Katakana: any continuous run of Katakana 10806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // characters is considered a candidate word with a default cost 10816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // specified in the katakanaCost table according to its length. 
10826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org //utext_setNativeIndex(text, rangeStart + i); 10836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_setNativeIndex(&normalizedText, i); 10846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bool is_katakana = isKatakana(utext_current32(&normalizedText)); 10856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (!is_prev_katakana && is_katakana) { 10866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int j = i + 1; 10876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(&normalizedText); 10886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Find the end of the continuous run of Katakana characters 10896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org while (j < numChars && (j - i) < kMaxKatakanaGroupLength && 10906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org isKatakana(utext_current32(&normalizedText))) { 10916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_next32(&normalizedText); 10926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++j; 10936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 10946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if ((j - i) < kMaxKatakanaGroupLength) { 10956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 10966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (newSnlp < bestSnlp[j]) { 10976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bestSnlp[j] = newSnlp; 10986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org prev[j] = i; 10996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org is_prev_katakana = is_katakana; 
11036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Start pushing the optimal offset index into t_boundary (t for tentative). 11066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // prev[numChars] is guaranteed to be meaningful. 11076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // We'll first push in the reverse order, i.e., 11086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // t_boundary[0] = numChars, and afterwards do a swap. 11096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // TODO: Replace by UVector32. 11106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org AutoBuffer<int, maxWordSize> t_boundary(numChars + 1); 11116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int numBreaks = 0; 11136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // No segmentation found, set boundary to end of range 11146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (bestSnlp[numChars] == kuint32max) { 11156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org t_boundary[numBreaks++] = numChars; 11166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 11176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int i = numChars; i > 0; i = prev[i]) { 11186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org t_boundary[numBreaks++] = i; 11196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0); 11216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Reverse offset index in t_boundary. 
11246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Don't add a break for the start of the dictionary range if there is one 11256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // there already. 11266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { 11276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org t_boundary[numBreaks++] = 0; 11286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Now that we're done, convert positions in t_bdry[] (indices in 11316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the normalized input string) back to indices in the raw input string 11326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // while reversing t_bdry and pushing values to foundBreaks. 11336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int i = numBreaks-1; i >= 0; i--) { 11346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); 11356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 11366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org utext_close(&normalizedText); 11386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return numBreaks; 11396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 11406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 11416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END 11436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 11446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 11456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1146