10596faeddefbf198de137d5e893708495ab1584cFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/* 4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 5c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert* Copyright (C) 2014-2015, International Business Machines Corporation and 6fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* others. All Rights Reserved. 7fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 8fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*/ 9fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 10f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "unicode/utypes.h" 11f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION 12fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 13f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "cmemory.h" 14fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 15f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "unicode/filteredbrk.h" 16f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "unicode/ucharstriebuilder.h" 17f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "unicode/ures.h" 18fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 19f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "uresimp.h" // ures_getByKeyWithFallback 20f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "ubrkimpl.h" // U_ICUDATA_BRKITR 21f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "uvector.h" 22f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "cmemory.h" 23fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 24fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_BEGIN 25fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 26f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#ifndef FB_DEBUG 27f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#define FB_DEBUG 0 28f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#endif 29f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 30f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#if FB_DEBUG 31f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include <stdio.h> 32f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusstatic void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) { 33f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius char buf[2048]; 34f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(s) { 35f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius s->extract(0,s->length(),buf,2048); 36f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } else { 37f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius strcpy(buf,"NULL"); 38f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 39f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n", 40f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius f, l, m, buf, (const void*)s, b?'T':'F',(int)d); 41f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 42f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 43f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__) 44f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#else 45f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#define FB_TRACE(m,s,b,d) 46f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#endif 47f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 48c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert/** 49c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * Used with sortedInsert() 50c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert */ 51f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusstatic int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 52f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const UnicodeString &a = *(const UnicodeString*)t1.pointer; 53f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const UnicodeString &b = *(const UnicodeString*)t2.pointer; 54f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return a.compare(b); 55f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 56f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 57f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius/** 58f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * A UVector which implements a set of strings. 59f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 60c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertclass U_COMMON_API UStringSet : public UVector { 61f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius public: 62f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, 63f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius uhash_compareUnicodeString, 64f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 1, 65f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius status) {} 66f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius virtual ~UStringSet(); 67f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 68f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Is this UnicodeSet contained? 69f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 70f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius inline UBool contains(const UnicodeString& s) { 71f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return contains((void*) &s); 72f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 73f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius using UVector::contains; 74f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 75f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Return the ith UnicodeString alias 76f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 77f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius inline const UnicodeString* getStringAt(int32_t i) const { 78f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return (const UnicodeString*)elementAt(i); 79f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 80f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 81f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Adopt the UnicodeString if not already contained. 82f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Caller no longer owns the pointer in any case. 83f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @return true if adopted successfully, false otherwise (error, or else duplicate) 84f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 85f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius inline UBool adopt(UnicodeString *str, UErrorCode &status) { 86f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(U_FAILURE(status) || contains(*str)) { 87f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius delete str; 88f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return false; 89f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } else { 90f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sortedInsert(str, compareUnicodeString, status); 91f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(U_FAILURE(status)) { 92f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius delete str; 93f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return false; 94f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 95f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return true; 96f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 97f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 98f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 99f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Add by value. 100f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @return true if successfully adopted. 101f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 102f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius inline UBool add(const UnicodeString& str, UErrorCode &status) { 103f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(U_FAILURE(status)) return false; 104f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeString *t = new UnicodeString(str); 105f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(t==NULL) { 106f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius status = U_MEMORY_ALLOCATION_ERROR; return false; 107f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 108f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return adopt(t, status); 109f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 110f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /** 111f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Remove this string. 112f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * @return true if successfully removed, false otherwise (error, or else it wasn't there) 113f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 114f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius inline UBool remove(const UnicodeString &s, UErrorCode &status) { 115f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(U_FAILURE(status)) return false; 116f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return removeElement((void*) &s); 117f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 118f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius}; 119f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 120f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius/** 121f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Virtual, won't be inlined 122f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 123f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusUStringSet::~UStringSet() {} 124f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 125c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert/* ----------------------------------------------------------- */ 126c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 128c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert/* Filtered Break constants */ 129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie 130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic const int32_t kMATCH = (1<<1); //< exact match - skip this one. 131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic const int32_t kSuppressInReverse = (1<<0); 132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic const int32_t kAddToForward = (1<<1); 133c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertstatic const UChar kFULLSTOP = 0x002E; // '.' 134c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 135c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert/** 136c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * Shared data for SimpleFilteredSentenceBreakIterator 137c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert */ 138c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertclass SimpleFilteredSentenceBreakData : public UMemory { 139c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertpublic: 140c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) 141c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } 142c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } 143c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } 144c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual ~SimpleFilteredSentenceBreakData(); 145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 146c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." 147c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. 148c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int32_t refcount; 149c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert}; 150c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 151c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} 152c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 153c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert/** 154c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * Concrete implementation 155c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert */ 156f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusclass SimpleFilteredSentenceBreakIterator : public BreakIterator { 157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 158f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); 159f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); 160f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius virtual ~SimpleFilteredSentenceBreakIterator(); 161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate: 162c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert SimpleFilteredSentenceBreakData *fData; 163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<BreakIterator> fDelegate; 164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalUTextPointer fText; 165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /* -- subclass interface -- */ 167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /* -- cloning and other subclass stuff -- */ 169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, 170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t &/*BufferSize*/, 171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &status) { 172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // for now - always deep clone 173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius status = U_SAFECLONE_ALLOCATED_WARNING; 174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return clone(); 175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 176f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); } 177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual UClassID getDynamicClassID(void) const { return NULL; } 1781b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; } 179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /* -- text modifying -- */ 181fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); } 182fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; } 183fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); } 184fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual void setText(const UnicodeString &text) { fDelegate->setText(text); } 185fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 186fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /* -- other functions that are just delegated -- */ 187fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); } 188fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } 189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /* -- ITERATION -- */ 191c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual int32_t first(void); 192c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual int32_t preceding(int32_t offset); 193c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual int32_t previous(void); 194c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual UBool isBoundary(int32_t offset); 195c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct. 196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual int32_t next(void); 198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 199c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual int32_t next(int32_t n); 200c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual int32_t following(int32_t offset); 201c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert virtual int32_t last(void); 202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 203c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertprivate: 204c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert /** 205c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * Given that the fDelegate has already given its "initial" answer, 206c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * find the NEXT actual (non-excepted) break. 207c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * @param n initial position from delegate 208c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * @return new break position or UBRK_DONE 209c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert */ 210c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int32_t internalNext(int32_t n); 211c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert /** 212c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * Given that the fDelegate has already given its "initial" answer, 213c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * find the PREV actual (non-excepted) break. 214c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * @param n initial position from delegate 215c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * @return new break position or UBRK_DONE 216c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert */ 217c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int32_t internalPrev(int32_t n); 218c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert /** 219c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * set up the UText with the value of the fDelegate. 220c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * Call this before calling breakExceptionAt. 221c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * May be able to avoid excess calls 222c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert */ 223c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert void resetState(UErrorCode &status); 224c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert /** 225c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * Is there a match (exception) at this spot? 226c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert */ 227c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; 228c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert /** 229c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * Determine if there is an exception at this spot 230c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * @param n spot to check 231c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert * @return kNoExceptionHere or kExceptionHere 232c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert **/ 233c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert enum EFBMatchResult breakExceptionAt(int32_t n); 234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 236f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusSimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) 237c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) 238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{ 239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 242f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusSimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : 243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), 244c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), 245c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert fDelegate(adopt) 246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{ 247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // all set.. 248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 250c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { 251c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert fData = fData->decr(); 252c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 253f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 254c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertvoid SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { 255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); 256c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 257c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 258c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::EFBMatchResult 259c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { 260c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int64_t bestPosn = -1; 261c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int32_t bestValue = -1; 262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // loops while 'n' points to an exception. 263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utext_setNativeIndex(fText.getAlias(), n); // from n.. 264c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert fData->fBackwardsTrie->reset(); 265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 uch; 266c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf(" n@ %d\n", n); 268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") 269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? 270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // TODO only do this the 1st time? 271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); 272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); 274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uch = utext_next32(fText.getAlias()); 275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); 276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 278c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; 279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. 281c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie 282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far 283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius bestPosn = utext_getNativeIndex(fText.getAlias()); 284c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert bestValue = fData->fBackwardsTrie->getValue(); 285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); 287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(USTRINGTRIE_MATCHES(r)) { // exact match? 290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 291c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert bestValue = fData->fBackwardsTrie->getValue(); 292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius bestPosn = utext_getNativeIndex(fText.getAlias()); 293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(bestPosn>=0) { 297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); 298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? 300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //int32_t bestValue = fBackwardsTrie->getValue(); 301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); 302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(bestValue == kMATCH) { // exact match! 304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf(" exact backward match\n"); 305c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return kExceptionHere; // See if the next is another exception. 306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(bestValue == kPARTIAL 307c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie 308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf(" partial backward match\n"); 309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie 310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // to see if it matches something going forward. 311c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert fData->fForwardsPartialTrie->reset(); 312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; 313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. 314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("Retrying at %d\n", bestPosn); 315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && 316c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { 317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); 318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(USTRINGTRIE_MATCHES(rfwd)) { 320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); 321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // only full matches here, nothing to check 322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // skip the next: 323c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return kExceptionHere; 324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); 326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // no match (no exception) -return the 'underlying' break 327c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return kNoExceptionHere; 328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 330c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return kNoExceptionHere; // internal error and/or no forwards trie 331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match 334c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return kNoExceptionHere; // No match - so exit. Not an exception. 335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 336c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 337c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 338c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert// the workhorse single next. 339c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 340c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { 341c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if(n == UBRK_DONE || // at end or 342c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions 343c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return n; 344c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 345c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // OK, do we need to break here? 346c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UErrorCode status = U_ZERO_ERROR; 347c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // refresh text 348c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert resetState(status); 349c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if(U_FAILURE(status)) return UBRK_DONE; // bail out 350c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert int64_t utextLen = utext_nativeLength(fText.getAlias()); 351c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 352c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 353c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). 354c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); 355c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 356c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert switch(m) { 357c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert case kExceptionHere: 358c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert n = fDelegate->next(); // skip this one. Find the next lowerlevel break. 359c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert continue; 360c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 361c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert default: 362c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert case kNoExceptionHere: 363c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return n; 364c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 365c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return n; 367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 369c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 370c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { 371c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if(n == 0 || n == UBRK_DONE || // at end or 372c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions 373c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return n; 374c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 375c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // OK, do we need to break here? 376c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UErrorCode status = U_ZERO_ERROR; 377c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // refresh text 378c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert resetState(status); 379c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert if(U_FAILURE(status)) return UBRK_DONE; // bail out 380c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 381c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 382c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). 383c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); 384c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 385c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert switch(m) { 386c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert case kExceptionHere: 387c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. 388c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert continue; 389c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 390c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert default: 391c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert case kNoExceptionHere: 392c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return n; 393c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 394c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 395c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return n; 396c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 397c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 398c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 399c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 400c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::next() { 401c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return internalNext(fDelegate->next()); 402c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 403c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 404c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 405c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::first(void) { 40664339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert // Don't suppress a break opportunity at the beginning of text. 40764339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert return fDelegate->first(); 408c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 409c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 410c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 411c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { 412c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return internalPrev(fDelegate->preceding(offset)); 413c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 414c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 415c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 416c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::previous(void) { 417c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return internalPrev(fDelegate->previous()); 418c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 419c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 420c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertUBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { 42164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert if (!fDelegate->isBoundary(offset)) return false; // no break to suppress 42264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert 42364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions 424c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 425c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert UErrorCode status = U_ZERO_ERROR; 426c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert resetState(status); 427c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 428c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); 429c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 430c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert switch(m) { 431c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert case kExceptionHere: 432c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return false; 433c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert default: 434c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert case kNoExceptionHere: 435c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return true; 436c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert } 437c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 438c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 439c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 440c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::next(int32_t offset) { 441c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return internalNext(fDelegate->next(offset)); 442c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 443c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 444c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 445c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::following(int32_t offset) { 446c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return internalNext(fDelegate->following(offset)); 447c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 448c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 449c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertint32_t 450c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik RoubertSimpleFilteredSentenceBreakIterator::last(void) { 451c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert // Don't suppress a break opportunity at the end of text. 452c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return fDelegate->last(); 453c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert} 454c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 455c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert 456f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius/** 457f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Concrete implementation of builder class. 458f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 459c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubertclass U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { 460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual ~SimpleFilteredBreakIteratorBuilder(); 462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); 463f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius SimpleFilteredBreakIteratorBuilder(UErrorCode &status); 464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); 465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); 466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); 467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate: 468f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UStringSet fSet; 469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusSimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() 472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{ 473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 475f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusSimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) 476f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius : fSet(status) 477f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius{ 478f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 479f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusSimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) 481f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius : fSet(status) 482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{ 483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_SUCCESS(status)) { 48464339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert UErrorCode subStatus = U_ZERO_ERROR; 48564339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus)); 48664339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 48764339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert status = subStatus; // copy the failing status 48864339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#if FB_DEBUG 48964339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 49064339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#endif 49164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert return; // leaves the builder empty, if you try to use it. 49264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert } 49364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus)); 49464339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 49564339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert status = subStatus; // copy the failing status 49664339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#if FB_DEBUG 49764339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 49864339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#endif 49964339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert return; // leaves the builder empty, if you try to use it. 50064339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert } 50164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus)); 50264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert 50364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#if FB_DEBUG 50464339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert { 50564339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert UErrorCode subsub = subStatus; 50664339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus)); 50764339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert } 50864339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#endif 50964339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert 51064339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 51164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert status = subStatus; // copy the failing status 51264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#if FB_DEBUG 51364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 51464339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#endif 51564339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert return; // leaves the builder empty, if you try to use it. 51664339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert } 517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalUResourceBundlePointer strs; 51964339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert subStatus = status; // Pick up inherited warning status now 520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius do { 521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); 522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strs.isValid() && U_SUCCESS(subStatus)) { 523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); 524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius suppressBreakAfter(str, status); // load the string 525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } while (strs.isValid() && U_SUCCESS(subStatus)); 527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { 528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius status = subStatus; 529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusSimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{ 536f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UBool r = fSet.add(exception, status); 537f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("suppressBreakAfter",&exception,r,0); 538f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return r; 539fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 540fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 541fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 542fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusSimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 543fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{ 544f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UBool r = fSet.remove(exception, status); 545f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("unsuppressBreakAfter",&exception,r,0); 546f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return r; 547f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 548f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 549f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius/** 550f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. 551f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Work around this. 552f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * 553f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * Note: "new UnicodeString[subCount]" ends up calling global operator new 554f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius * on MSVC2012 for some reason. 555f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius */ 556f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusstatic inline UnicodeString* newUnicodeStringArray(size_t count) { 557f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return new UnicodeString[count ? count : 1]; 558fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 559fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusBreakIterator * 561fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusSimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { 562fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<BreakIterator> adopt(adoptBreakIterator); 563fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 5641b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); 5651b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); 566fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(status)) { 567fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 568fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 569fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 570fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t revCount = 0; 571fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t fwdCount = 0; 572fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 573fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t subCount = fSet.size(); 574f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 575f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); 576f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 577f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius LocalArray<UnicodeString> ustrs(ustrs_ptr); 578f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 579f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius LocalMemory<int> partials; 580f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius partials.allocateInsteadAndReset(subCount); 581fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 582fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. 583fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." 584fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 585fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int n=0; 586f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for ( int32_t i = 0; 587f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius i<fSet.size(); 588fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius i++) { 589f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const UnicodeString *abbr = fSet.getStringAt(i); 590f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(abbr) { 591f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("build",abbr,TRUE,i); 592f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ustrs[n] = *abbr; // copy by value 593f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i); 594f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } else { 595f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("build",abbr,FALSE,i); 596f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius status = U_MEMORY_ALLOCATION_ERROR; 597f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return NULL; 598f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius partials[n] = 0; // default: not partial 600fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius n++; 601fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 602fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // first pass - find partials. 603fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int i=0;i<subCount;i++) { 604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations 605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nn>-1 && (nn+1)!=ustrs[i].length()) { 606f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("partial",&ustrs[i],FALSE,i); 607fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // is partial. 608fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // is it unique? 609fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int sameAs = -1; 610fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int j=0;j<subCount;j++) { 611fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(j==i) continue; 612fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { 613f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("prefix",&ustrs[j],FALSE,nn+1); 614fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn 615fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(partials[j]==0) { // hasn't been processed yet 616fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius partials[j] = kSuppressInReverse | kAddToForward; 617f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("suppressing",&ustrs[j],FALSE,j); 618fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(partials[j] & kSuppressInReverse) { 619fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sameAs = j; // the other entry is already in the reverse table. 620fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 621fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 623f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs); 624f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]); 625fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString prefix(ustrs[i], 0, nn+1); 626fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sameAs == -1 && partials[i] == 0) { 627fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // first one - add the prefix to the reverse table. 628fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius prefix.reverse(); 629fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builder->add(prefix, kPARTIAL, status); 630fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius revCount++; 631f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("Added partial",&prefix,FALSE, i); 632f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); 633fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius partials[i] = kSuppressInReverse | kAddToForward; 634fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 635f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("NOT adding partial",&prefix,FALSE, i); 636f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); 637fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 638fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 639fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 640fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int i=0;i<subCount;i++) { 641fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(partials[i]==0) { 642fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ustrs[i].reverse(); 643fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builder->add(ustrs[i], kMATCH, status); 644fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius revCount++; 645f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i); 646fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 647f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("Adding fwd",&ustrs[i], FALSE, i); 648fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 649fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // an optimization would be to only add the portion after the '.' 650fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, 651fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // instead of "Ph.D." since we already know the "Ph." part is a match. 652fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // would need the trie to be able to hold 0-length strings, though. 653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builder2->add(ustrs[i], kMATCH, status); // forward 654fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fwdCount++; 655fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius //ustrs[i].reverse(); 656fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); 657fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 658fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 659f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE("AbbrCount",NULL,FALSE, subCount); 660fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 661fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(revCount>0) { 662fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); 663fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(status)) { 664f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE(u_errorName(status),NULL,FALSE, -1); 665fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 666fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 667fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 668fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 669fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fwdCount>0) { 670fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); 671fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(status)) { 672f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FB_TRACE(u_errorName(status),NULL,FALSE, -1); 673fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 674fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 675fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 676fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 677f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); 678fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 679fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 680fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 681f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// ----------- Base class implementation 682fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 683fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { 684fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 685fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 686fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { 687fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 688fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 689fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFilteredBreakIteratorBuilder * 690fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { 691fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(status)) return NULL; 6921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); 693c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return (U_SUCCESS(status))? ret.orphan(): NULL; 694fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 695fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 696fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusFilteredBreakIteratorBuilder * 697ffdc27edd5503111189fc11165c5a11289a71f79Fredrik RoubertFilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) { 698fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(status)) return NULL; 6991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); 700c14898b482f76ecab9026615e2e4c6fe78358bdcFredrik Roubert return (U_SUCCESS(status))? ret.orphan(): NULL; 701fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 702fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 703fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END 704fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 7050596faeddefbf198de137d5e893708495ab1584cFredrik Roubert#endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION 706