// filteredbrk.cpp revision c14898b482f76ecab9026615e2e4c6fe78358bdc
1/* 2******************************************************************************* 3* Copyright (C) 2014-2015, International Business Machines Corporation and 4* others. All Rights Reserved. 5******************************************************************************* 6*/ 7 8#include "unicode/utypes.h" 9#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION 10 11#include "cmemory.h" 12 13#include "unicode/filteredbrk.h" 14#include "unicode/ucharstriebuilder.h" 15#include "unicode/ures.h" 16 17#include "uresimp.h" // ures_getByKeyWithFallback 18#include "ubrkimpl.h" // U_ICUDATA_BRKITR 19#include "uvector.h" 20#include "cmemory.h" 21 22U_NAMESPACE_BEGIN 23 24#ifndef FB_DEBUG 25#define FB_DEBUG 0 26#endif 27 28#if FB_DEBUG 29#include <stdio.h> 30static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) { 31 char buf[2048]; 32 if(s) { 33 s->extract(0,s->length(),buf,2048); 34 } else { 35 strcpy(buf,"NULL"); 36 } 37 fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n", 38 f, l, m, buf, (const void*)s, b?'T':'F',(int)d); 39} 40 41#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__) 42#else 43#define FB_TRACE(m,s,b,d) 44#endif 45 46/** 47 * Used with sortedInsert() 48 */ 49static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 50 const UnicodeString &a = *(const UnicodeString*)t1.pointer; 51 const UnicodeString &b = *(const UnicodeString*)t2.pointer; 52 return a.compare(b); 53} 54 55/** 56 * A UVector which implements a set of strings. 57 */ 58class U_COMMON_API UStringSet : public UVector { 59 public: 60 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, 61 uhash_compareUnicodeString, 62 1, 63 status) {} 64 virtual ~UStringSet(); 65 /** 66 * Is this UnicodeSet contained? 
67 */ 68 inline UBool contains(const UnicodeString& s) { 69 return contains((void*) &s); 70 } 71 using UVector::contains; 72 /** 73 * Return the ith UnicodeString alias 74 */ 75 inline const UnicodeString* getStringAt(int32_t i) const { 76 return (const UnicodeString*)elementAt(i); 77 } 78 /** 79 * Adopt the UnicodeString if not already contained. 80 * Caller no longer owns the pointer in any case. 81 * @return true if adopted successfully, false otherwise (error, or else duplicate) 82 */ 83 inline UBool adopt(UnicodeString *str, UErrorCode &status) { 84 if(U_FAILURE(status) || contains(*str)) { 85 delete str; 86 return false; 87 } else { 88 sortedInsert(str, compareUnicodeString, status); 89 if(U_FAILURE(status)) { 90 delete str; 91 return false; 92 } 93 return true; 94 } 95 } 96 /** 97 * Add by value. 98 * @return true if successfully adopted. 99 */ 100 inline UBool add(const UnicodeString& str, UErrorCode &status) { 101 if(U_FAILURE(status)) return false; 102 UnicodeString *t = new UnicodeString(str); 103 if(t==NULL) { 104 status = U_MEMORY_ALLOCATION_ERROR; return false; 105 } 106 return adopt(t, status); 107 } 108 /** 109 * Remove this string. 110 * @return true if successfully removed, false otherwise (error, or else it wasn't there) 111 */ 112 inline UBool remove(const UnicodeString &s, UErrorCode &status) { 113 if(U_FAILURE(status)) return false; 114 return removeElement((void*) &s); 115 } 116}; 117 118/** 119 * Virtual, won't be inlined 120 */ 121UStringSet::~UStringSet() {} 122 123/* ----------------------------------------------------------- */ 124 125 126/* Filtered Break constants */ 127static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie 128static const int32_t kMATCH = (1<<1); //< exact match - skip this one. 129static const int32_t kSuppressInReverse = (1<<0); 130static const int32_t kAddToForward = (1<<1); 131static const UChar kFULLSTOP = 0x002E; // '.' 
132 133/** 134 * Shared data for SimpleFilteredSentenceBreakIterator 135 */ 136class SimpleFilteredSentenceBreakData : public UMemory { 137public: 138 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) 139 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } 140 SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } 141 SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } 142 virtual ~SimpleFilteredSentenceBreakData(); 143 144 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." 145 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. 146 int32_t refcount; 147}; 148 149SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} 150 151/** 152 * Concrete implementation 153 */ 154class SimpleFilteredSentenceBreakIterator : public BreakIterator { 155public: 156 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); 157 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); 158 virtual ~SimpleFilteredSentenceBreakIterator(); 159private: 160 SimpleFilteredSentenceBreakData *fData; 161 LocalPointer<BreakIterator> fDelegate; 162 LocalUTextPointer fText; 163 164 /* -- subclass interface -- */ 165public: 166 /* -- cloning and other subclass stuff -- */ 167 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, 168 int32_t &/*BufferSize*/, 169 UErrorCode &status) { 170 // for now - always deep clone 171 status = U_SAFECLONE_ALLOCATED_WARNING; 172 return clone(); 173 } 174 virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); } 175 virtual UClassID getDynamicClassID(void) const { return NULL; } 176 virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; } 177 178 /* -- text modifying -- */ 179 virtual void setText(UText *text, UErrorCode 
&status) { fDelegate->setText(text,status); } 180 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; } 181 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); } 182 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); } 183 184 /* -- other functions that are just delegated -- */ 185 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); } 186 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } 187 188 /* -- ITERATION -- */ 189 virtual int32_t first(void); 190 virtual int32_t preceding(int32_t offset); 191 virtual int32_t previous(void); 192 virtual UBool isBoundary(int32_t offset); 193 virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct. 194 195 virtual int32_t next(void); 196 197 virtual int32_t next(int32_t n); 198 virtual int32_t following(int32_t offset); 199 virtual int32_t last(void); 200 201private: 202 /** 203 * Given that the fDelegate has already given its "initial" answer, 204 * find the NEXT actual (non-excepted) break. 205 * @param n initial position from delegate 206 * @return new break position or UBRK_DONE 207 */ 208 int32_t internalNext(int32_t n); 209 /** 210 * Given that the fDelegate has already given its "initial" answer, 211 * find the PREV actual (non-excepted) break. 212 * @param n initial position from delegate 213 * @return new break position or UBRK_DONE 214 */ 215 int32_t internalPrev(int32_t n); 216 /** 217 * set up the UText with the value of the fDelegate. 218 * Call this before calling breakExceptionAt. 219 * May be able to avoid excess calls 220 */ 221 void resetState(UErrorCode &status); 222 /** 223 * Is there a match (exception) at this spot? 
224 */ 225 enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; 226 /** 227 * Determine if there is an exception at this spot 228 * @param n spot to check 229 * @return kNoExceptionHere or kExceptionHere 230 **/ 231 enum EFBMatchResult breakExceptionAt(int32_t n); 232}; 233 234SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) 235 : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) 236{ 237} 238 239 240SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : 241 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), 242 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), 243 fDelegate(adopt) 244{ 245 // all set.. 246} 247 248SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { 249 fData = fData->decr(); 250} 251 252void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { 253 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); 254} 255 256SimpleFilteredSentenceBreakIterator::EFBMatchResult 257SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { 258 int64_t bestPosn = -1; 259 int32_t bestValue = -1; 260 // loops while 'n' points to an exception. 261 utext_setNativeIndex(fText.getAlias(), n); // from n.. 262 fData->fBackwardsTrie->reset(); 263 UChar32 uch; 264 265 //if(debug2) u_printf(" n@ %d\n", n); 266 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") 267 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? 268 // TODO only do this the 1st time? 
269 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); 270 } else { 271 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); 272 uch = utext_next32(fText.getAlias()); 273 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); 274 } 275 276 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; 277 278 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. 279 USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie 280 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far 281 bestPosn = utext_getNativeIndex(fText.getAlias()); 282 bestValue = fData->fBackwardsTrie->getValue(); 283 } 284 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); 285 } 286 287 if(USTRINGTRIE_MATCHES(r)) { // exact match? 288 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 289 bestValue = fData->fBackwardsTrie->getValue(); 290 bestPosn = utext_getNativeIndex(fText.getAlias()); 291 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 292 } 293 294 if(bestPosn>=0) { 295 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 296 297 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? 298 //int32_t bestValue = fBackwardsTrie->getValue(); 299 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); 300 301 if(bestValue == kMATCH) { // exact match! 302 //if(debug2) u_printf(" exact backward match\n"); 303 return kExceptionHere; // See if the next is another exception. 304 } else if(bestValue == kPARTIAL 305 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie 306 //if(debug2) u_printf(" partial backward match\n"); 307 // We matched the "Ph." in "Ph.D." 
- now we need to run everything through the forwards trie 308 // to see if it matches something going forward. 309 fData->fForwardsPartialTrie->reset(); 310 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; 311 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. 312 //if(debug2) u_printf("Retrying at %d\n", bestPosn); 313 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && 314 USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { 315 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); 316 } 317 if(USTRINGTRIE_MATCHES(rfwd)) { 318 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); 319 // only full matches here, nothing to check 320 // skip the next: 321 return kExceptionHere; 322 } else { 323 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); 324 // no match (no exception) -return the 'underlying' break 325 return kNoExceptionHere; 326 } 327 } else { 328 return kNoExceptionHere; // internal error and/or no forwards trie 329 } 330 } else { 331 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match 332 return kNoExceptionHere; // No match - so exit. Not an exception. 333 } 334} 335 336// the workhorse single next. 337int32_t 338SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { 339 if(n == UBRK_DONE || // at end or 340 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions 341 return n; 342 } 343 // OK, do we need to break here? 344 UErrorCode status = U_ZERO_ERROR; 345 // refresh text 346 resetState(status); 347 if(U_FAILURE(status)) return UBRK_DONE; // bail out 348 int64_t utextLen = utext_nativeLength(fText.getAlias()); 349 350 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 351 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). 
352 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); 353 354 switch(m) { 355 case kExceptionHere: 356 n = fDelegate->next(); // skip this one. Find the next lowerlevel break. 357 continue; 358 359 default: 360 case kNoExceptionHere: 361 return n; 362 } 363 } 364 return n; 365} 366 367int32_t 368SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { 369 if(n == 0 || n == UBRK_DONE || // at end or 370 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions 371 return n; 372 } 373 // OK, do we need to break here? 374 UErrorCode status = U_ZERO_ERROR; 375 // refresh text 376 resetState(status); 377 if(U_FAILURE(status)) return UBRK_DONE; // bail out 378 379 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 380 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). 381 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); 382 383 switch(m) { 384 case kExceptionHere: 385 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. 
386 continue; 387 388 default: 389 case kNoExceptionHere: 390 return n; 391 } 392 } 393 return n; 394} 395 396 397int32_t 398SimpleFilteredSentenceBreakIterator::next() { 399 return internalNext(fDelegate->next()); 400} 401 402int32_t 403SimpleFilteredSentenceBreakIterator::first(void) { 404 return internalNext(fDelegate->first()); 405} 406 407int32_t 408SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { 409 return internalPrev(fDelegate->preceding(offset)); 410} 411 412int32_t 413SimpleFilteredSentenceBreakIterator::previous(void) { 414 return internalPrev(fDelegate->previous()); 415} 416 417UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { 418 if(!fDelegate->isBoundary(offset)) return false; // no break to suppress 419 420 UErrorCode status = U_ZERO_ERROR; 421 resetState(status); 422 423 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); 424 425 switch(m) { 426 case kExceptionHere: 427 return false; 428 default: 429 case kNoExceptionHere: 430 return true; 431 } 432} 433 434int32_t 435SimpleFilteredSentenceBreakIterator::next(int32_t offset) { 436 return internalNext(fDelegate->next(offset)); 437} 438 439int32_t 440SimpleFilteredSentenceBreakIterator::following(int32_t offset) { 441 return internalNext(fDelegate->following(offset)); 442} 443 444int32_t 445SimpleFilteredSentenceBreakIterator::last(void) { 446 // Don't suppress a break opportunity at the end of text. 447 return fDelegate->last(); 448} 449 450 451/** 452 * Concrete implementation of builder class. 
453 */ 454class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { 455public: 456 virtual ~SimpleFilteredBreakIteratorBuilder(); 457 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); 458 SimpleFilteredBreakIteratorBuilder(UErrorCode &status); 459 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); 460 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); 461 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); 462private: 463 UStringSet fSet; 464}; 465 466SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() 467{ 468} 469 470SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) 471 : fSet(status) 472{ 473} 474 475SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) 476 : fSet(status) 477{ 478 if(U_SUCCESS(status)) { 479 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status)); 480 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status)); 481 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status)); 482 if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it. 
483 484 LocalUResourceBundlePointer strs; 485 UErrorCode subStatus = status; 486 do { 487 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); 488 if(strs.isValid() && U_SUCCESS(subStatus)) { 489 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); 490 suppressBreakAfter(str, status); // load the string 491 } 492 } while (strs.isValid() && U_SUCCESS(subStatus)); 493 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { 494 status = subStatus; 495 } 496 } 497} 498 499UBool 500SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 501{ 502 UBool r = fSet.add(exception, status); 503 FB_TRACE("suppressBreakAfter",&exception,r,0); 504 return r; 505} 506 507UBool 508SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 509{ 510 UBool r = fSet.remove(exception, status); 511 FB_TRACE("unsuppressBreakAfter",&exception,r,0); 512 return r; 513} 514 515/** 516 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. 517 * Work around this. 518 * 519 * Note: "new UnicodeString[subCount]" ends up calling global operator new 520 * on MSVC2012 for some reason. 521 */ 522static inline UnicodeString* newUnicodeStringArray(size_t count) { 523 return new UnicodeString[count ? 
count : 1]; 524} 525 526BreakIterator * 527SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { 528 LocalPointer<BreakIterator> adopt(adoptBreakIterator); 529 530 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); 531 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); 532 if(U_FAILURE(status)) { 533 return NULL; 534 } 535 536 int32_t revCount = 0; 537 int32_t fwdCount = 0; 538 539 int32_t subCount = fSet.size(); 540 541 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); 542 543 LocalArray<UnicodeString> ustrs(ustrs_ptr); 544 545 LocalMemory<int> partials; 546 partials.allocateInsteadAndReset(subCount); 547 548 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. 549 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." 550 551 int n=0; 552 for ( int32_t i = 0; 553 i<fSet.size(); 554 i++) { 555 const UnicodeString *abbr = fSet.getStringAt(i); 556 if(abbr) { 557 FB_TRACE("build",abbr,TRUE,i); 558 ustrs[n] = *abbr; // copy by value 559 FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i); 560 } else { 561 FB_TRACE("build",abbr,FALSE,i); 562 status = U_MEMORY_ALLOCATION_ERROR; 563 return NULL; 564 } 565 partials[n] = 0; // default: not partial 566 n++; 567 } 568 // first pass - find partials. 569 for(int i=0;i<subCount;i++) { 570 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations 571 if(nn>-1 && (nn+1)!=ustrs[i].length()) { 572 FB_TRACE("partial",&ustrs[i],FALSE,i); 573 // is partial. 574 // is it unique? 
575 int sameAs = -1; 576 for(int j=0;j<subCount;j++) { 577 if(j==i) continue; 578 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { 579 FB_TRACE("prefix",&ustrs[j],FALSE,nn+1); 580 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn 581 if(partials[j]==0) { // hasn't been processed yet 582 partials[j] = kSuppressInReverse | kAddToForward; 583 FB_TRACE("suppressing",&ustrs[j],FALSE,j); 584 } else if(partials[j] & kSuppressInReverse) { 585 sameAs = j; // the other entry is already in the reverse table. 586 } 587 } 588 } 589 FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs); 590 FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]); 591 UnicodeString prefix(ustrs[i], 0, nn+1); 592 if(sameAs == -1 && partials[i] == 0) { 593 // first one - add the prefix to the reverse table. 594 prefix.reverse(); 595 builder->add(prefix, kPARTIAL, status); 596 revCount++; 597 FB_TRACE("Added partial",&prefix,FALSE, i); 598 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); 599 partials[i] = kSuppressInReverse | kAddToForward; 600 } else { 601 FB_TRACE("NOT adding partial",&prefix,FALSE, i); 602 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); 603 } 604 } 605 } 606 for(int i=0;i<subCount;i++) { 607 if(partials[i]==0) { 608 ustrs[i].reverse(); 609 builder->add(ustrs[i], kMATCH, status); 610 revCount++; 611 FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i); 612 } else { 613 FB_TRACE("Adding fwd",&ustrs[i], FALSE, i); 614 615 // an optimization would be to only add the portion after the '.' 616 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, 617 // instead of "Ph.D." since we already know the "Ph." part is a match. 618 // would need the trie to be able to hold 0-length strings, though. 
619 builder2->add(ustrs[i], kMATCH, status); // forward 620 fwdCount++; 621 //ustrs[i].reverse(); 622 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); 623 } 624 } 625 FB_TRACE("AbbrCount",NULL,FALSE, subCount); 626 627 if(revCount>0) { 628 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); 629 if(U_FAILURE(status)) { 630 FB_TRACE(u_errorName(status),NULL,FALSE, -1); 631 return NULL; 632 } 633 } 634 635 if(fwdCount>0) { 636 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); 637 if(U_FAILURE(status)) { 638 FB_TRACE(u_errorName(status),NULL,FALSE, -1); 639 return NULL; 640 } 641 } 642 643 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); 644} 645 646 647// ----------- Base class implementation 648 649FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { 650} 651 652FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { 653} 654 655FilteredBreakIteratorBuilder * 656FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { 657 if(U_FAILURE(status)) return NULL; 658 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); 659 return (U_SUCCESS(status))? ret.orphan(): NULL; 660} 661 662FilteredBreakIteratorBuilder * 663FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) { 664 if(U_FAILURE(status)) return NULL; 665 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); 666 return (U_SUCCESS(status))? ret.orphan(): NULL; 667} 668 669U_NAMESPACE_END 670 671#endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION 672