brkeng.cpp revision c73f511526464f8e56c242df80552e9b0d94ae3d
1b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch/* 2b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch ************************************************************************************ 3b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch * Copyright (C) 2006-2013, International Business Machines Corporation 4b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch * and others. All Rights Reserved. 5b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch ************************************************************************************ 6b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch */ 7b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 8b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/utypes.h" 9b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 10b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#if !UCONFIG_NO_BREAK_ITERATION 11b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 12b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "brkeng.h" 13b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "dictbe.h" 14b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/uchar.h" 15b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/uniset.h" 16b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/chariter.h" 17b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/ures.h" 18b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/udata.h" 19b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/putil.h" 20b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/ustring.h" 21b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/uscript.h" 22b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/ucharstrie.h" 23b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "unicode/bytestrie.h" 24b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "charstr.h" 25b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "dictionarydata.h" 26b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "uvector.h" 27b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "umutex.h" 28b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "uresimp.h" 29b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch#include "ubrkimpl.h" 30b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 31b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochU_NAMESPACE_BEGIN 32b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 33b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch/* 34b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch ****************************************************************** 35b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch */ 36b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 37b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochLanguageBreakEngine::LanguageBreakEngine() { 38b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 39b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 40b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochLanguageBreakEngine::~LanguageBreakEngine() { 41b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 42b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 43b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch/* 44537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch ****************************************************************** 45537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch */ 46537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch 47537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben MurdochLanguageBreakFactory::LanguageBreakFactory() { 48537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch} 49537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch 50b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochLanguageBreakFactory::~LanguageBreakFactory() { 51b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 52b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 53537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch/* 54537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch ****************************************************************** 55537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch */ 56537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch 57537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben MurdochUnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { 58537ba893e2530051ec7f296e769fdd37bb4ae4a0Ben Murdoch for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 59b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch fHandled[i] = 0; 60b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch } 61b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 62b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 63b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUnhandledEngine::~UnhandledEngine() { 64b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 65b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch if (fHandled[i] != 0) { 66b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch delete fHandled[i]; 67b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch } 68b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch } 69b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 70b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 71b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUBool 72b8a8cc1952d61a2f3a2568848933943a543b5d3eBen MurdochUnhandledEngine::handles(UChar32 c, int32_t breakType) const { 73b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) 74b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); 75b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch} 76b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 77int32_t 78UnhandledEngine::findBreaks( UText *text, 79 int32_t startPos, 80 int32_t endPos, 81 UBool reverse, 82 int32_t breakType, 83 UStack &/*foundBreaks*/ ) const { 84 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 85 UChar32 c = utext_current32(text); 86 if (reverse) { 87 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { 88 c = utext_previous32(text); 89 } 90 } 91 else { 92 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { 93 utext_next32(text); // TODO: recast loop to work with post-increment operations. 94 c = utext_current32(text); 95 } 96 } 97 } 98 return 0; 99} 100 101void 102UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { 103 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 104 if (fHandled[breakType] == 0) { 105 fHandled[breakType] = new UnicodeSet(); 106 if (fHandled[breakType] == 0) { 107 return; 108 } 109 } 110 if (!fHandled[breakType]->contains(c)) { 111 UErrorCode status = U_ZERO_ERROR; 112 // Apply the entire script of the character. 113 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 114 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 115 } 116 } 117} 118 119/* 120 ****************************************************************** 121 */ 122 123ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 124 fEngines = 0; 125} 126 127ICULanguageBreakFactory::~ICULanguageBreakFactory() { 128 if (fEngines != 0) { 129 delete fEngines; 130 } 131} 132 133U_NAMESPACE_END 134U_CDECL_BEGIN 135static void U_CALLCONV _deleteEngine(void *obj) { 136 delete (const icu::LanguageBreakEngine *) obj; 137} 138U_CDECL_END 139U_NAMESPACE_BEGIN 140 141const LanguageBreakEngine * 142ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { 143 UBool needsInit; 144 int32_t i; 145 const LanguageBreakEngine *lbe = NULL; 146 UErrorCode status = U_ZERO_ERROR; 147 148 // TODO: The global mutex should not be used. 149 // The global mutex should only be used for short periods. 150 // A ICULanguageBreakFactory specific mutex should be used. 151 umtx_lock(NULL); 152 needsInit = (UBool)(fEngines == NULL); 153 if (!needsInit) { 154 i = fEngines->size(); 155 while (--i >= 0) { 156 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 157 if (lbe != NULL && lbe->handles(c, breakType)) { 158 break; 159 } 160 lbe = NULL; 161 } 162 } 163 umtx_unlock(NULL); 164 165 if (lbe != NULL) { 166 return lbe; 167 } 168 169 if (needsInit) { 170 UStack *engines = new UStack(_deleteEngine, NULL, status); 171 if (U_SUCCESS(status) && engines == NULL) { 172 status = U_MEMORY_ALLOCATION_ERROR; 173 } 174 else if (U_FAILURE(status)) { 175 delete engines; 176 engines = NULL; 177 } 178 else { 179 umtx_lock(NULL); 180 if (fEngines == NULL) { 181 fEngines = engines; 182 engines = NULL; 183 } 184 umtx_unlock(NULL); 185 delete engines; 186 } 187 } 188 189 if (fEngines == NULL) { 190 return NULL; 191 } 192 193 // We didn't find an engine the first time through, or there was no 194 // stack. Create an engine. 195 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); 196 197 // Now get the lock, and see if someone else has created it in the 198 // meantime 199 umtx_lock(NULL); 200 i = fEngines->size(); 201 while (--i >= 0) { 202 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 203 if (lbe != NULL && lbe->handles(c, breakType)) { 204 break; 205 } 206 lbe = NULL; 207 } 208 if (lbe == NULL && newlbe != NULL) { 209 fEngines->push((void *)newlbe, status); 210 lbe = newlbe; 211 newlbe = NULL; 212 } 213 umtx_unlock(NULL); 214 215 delete newlbe; 216 217 return lbe; 218} 219 220const LanguageBreakEngine * 221ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { 222 UErrorCode status = U_ZERO_ERROR; 223 UScriptCode code = uscript_getScript(c, &status); 224 if (U_SUCCESS(status)) { 225 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); 226 if (m != NULL) { 227 const LanguageBreakEngine *engine = NULL; 228 switch(code) { 229 case USCRIPT_THAI: 230 engine = new ThaiBreakEngine(m, status); 231 break; 232 case USCRIPT_LAO: 233 engine = new LaoBreakEngine(m, status); 234 break; 235 case USCRIPT_KHMER: 236 engine = new KhmerBreakEngine(m, status); 237 break; 238 239#if !UCONFIG_NO_NORMALIZATION 240 // CJK not available w/o normalization 241 case USCRIPT_HANGUL: 242 engine = new CjkBreakEngine(m, kKorean, status); 243 break; 244 245 // use same BreakEngine and dictionary for both Chinese and Japanese 246 case USCRIPT_HIRAGANA: 247 case USCRIPT_KATAKANA: 248 case USCRIPT_HAN: 249 engine = new CjkBreakEngine(m, kChineseJapanese, status); 250 break; 251#if 0 252 // TODO: Have to get some characters with script=common handled 253 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 254 // them to CjkBreakEngine does not work. The engine has to 255 // special-case them. 256 case USCRIPT_COMMON: 257 { 258 UBlockCode block = ublock_getCode(code); 259 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 260 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 261 break; 262 } 263#endif 264#endif 265 266 default: 267 break; 268 } 269 if (engine == NULL) { 270 delete m; 271 } 272 else if (U_FAILURE(status)) { 273 delete engine; 274 engine = NULL; 275 } 276 return engine; 277 } 278 } 279 return NULL; 280} 281 282DictionaryMatcher * 283ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 284 UErrorCode status = U_ZERO_ERROR; 285 // open root from brkitr tree. 286 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 287 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 288 int32_t dictnlength = 0; 289 const UChar *dictfname = 290 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); 291 if (U_FAILURE(status)) { 292 ures_close(b); 293 return NULL; 294 } 295 CharString dictnbuf; 296 CharString ext; 297 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot 298 if (extStart != NULL) { 299 int32_t len = (int32_t)(extStart - dictfname); 300 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); 301 dictnlength = len; 302 } 303 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); 304 ures_close(b); 305 306 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); 307 if (U_SUCCESS(status)) { 308 // build trie 309 const uint8_t *data = (const uint8_t *)udata_getMemory(file); 310 const int32_t *indexes = (const int32_t *)data; 311 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 312 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 313 DictionaryMatcher *m = NULL; 314 if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 315 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; 316 const char *characters = (const char *)(data + offset); 317 m = new BytesDictionaryMatcher(characters, transform, file); 318 } 319 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 320 const UChar *characters = (const UChar *)(data + offset); 321 m = new UCharsDictionaryMatcher(characters, file); 322 } 323 if (m == NULL) { 324 // no matcher exists to take ownership - either we are an invalid 325 // type or memory allocation failed 326 udata_close(file); 327 } 328 return m; 329 } else if (dictfname != NULL) { 330 // we don't have a dictionary matcher. 331 // returning NULL here will cause us to fail to find a dictionary break engine, as expected 332 status = U_ZERO_ERROR; 333 return NULL; 334 } 335 return NULL; 336} 337 338U_NAMESPACE_END 339 340#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 341