1/*
2*******************************************************************************
3* Copyright (C) 2014, International Business Machines Corporation and
4* others. All Rights Reserved.
5*******************************************************************************
6*/
7
8#include "unicode/filteredbrk.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION
11
12#include <unicode/ucharstriebuilder.h>
13
14#include <set>
15#include <string>
16#include <functional>
17#include "uresimp.h"
18#include "ubrkimpl.h"
19
20U_NAMESPACE_BEGIN
21
22using namespace std;
23
24static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
25static const int32_t kMATCH   = (1<<1); //< exact match - skip this one.
26static const int32_t kSuppressInReverse = (1<<0);
27static const int32_t kAddToForward = (1<<1);
28static const UChar kFULLSTOP = 0x002E; // '.'
29
30class ULISentenceBreakIterator : public BreakIterator {
31public:
32  ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
33  virtual ~ULISentenceBreakIterator() {}
34  ULISentenceBreakIterator(const ULISentenceBreakIterator& other);
35private:
36  LocalPointer<BreakIterator> fDelegate;
37  LocalUTextPointer           fText;
38  LocalPointer<UCharsTrie>    fBackwardsTrie; //  i.e. ".srM" for Mrs.
39  LocalPointer<UCharsTrie>    fForwardsPartialTrie; //  Has ".a" for "a.M."
40
41  /* -- subclass interface -- */
42public:
43  /* -- cloning and other subclass stuff -- */
44  virtual BreakIterator *  createBufferClone(void * /*stackBuffer*/,
45                                             int32_t &/*BufferSize*/,
46                                             UErrorCode &status) {
47    // for now - always deep clone
48    status = U_SAFECLONE_ALLOCATED_WARNING;
49    return clone();
50  }
51  virtual BreakIterator* clone(void) const { return new ULISentenceBreakIterator(*this); }
52  virtual UClassID getDynamicClassID(void) const { return NULL; }
53  virtual UBool operator==(const BreakIterator& o) const { if(*this==o) return true; return false; }
54
55  /* -- text modifying -- */
56  virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
57  virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
58  virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
59  virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
60
61  /* -- other functions that are just delegated -- */
62  virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
63  virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
64
65  /* -- ITERATION -- */
66  virtual int32_t first(void) { return fDelegate->first(); }
67  virtual int32_t preceding(int32_t offset) { return fDelegate->preceding(offset); }
68  virtual int32_t previous(void) { return fDelegate->previous(); }
69  virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); }
70  virtual int32_t current(void) const { return fDelegate->current(); }
71
72  virtual int32_t next(void);
73
74  virtual int32_t next(int32_t n) { return fDelegate->next(n); }
75  virtual int32_t following(int32_t offset) { return fDelegate->following(offset); }
76  virtual int32_t last(void) { return fDelegate->last(); }
77
78};
79
80ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterator& other)
81  : BreakIterator(other), fDelegate(other.fDelegate->clone())
82{
83  /*
84    TODO: not able to clone Tries. Should be a refcounted hidden master instead.
85  if(other.fBackwardsTrie.isValid()) {
86    fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone());
87  }
88  if(other.fForwardsPartialTrie.isValid()) {
89    fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone());
90  }
91  */
92}
93
94
95ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
96  BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
97  fDelegate(adopt),
98  fBackwardsTrie(backwards),
99  fForwardsPartialTrie(forwards)
100{
101  // all set..
102}
103
104int32_t ULISentenceBreakIterator::next() {
105  int32_t n = fDelegate->next();
106  if(n == UBRK_DONE || // at end  or
107     fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
108    return n;
109  }
110  // OK, do we need to break here?
111  UErrorCode status = U_ZERO_ERROR;
112  // refresh text
113  fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
114  //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
115  do { // outer loop runs once per underlying break (from fDelegate).
116    // loops while 'n' points to an exception.
117    utext_setNativeIndex(fText.getAlias(), n); // from n..
118    fBackwardsTrie->reset();
119    UChar32 uch;
120    //if(debug2) u_printf(" n@ %d\n", n);
121    // Assume a space is following the '.'  (so we handle the case:  "Mr. /Brown")
122    if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) {  // TODO: skip a class of chars here??
123      // TODO only do this the 1st time?
124      //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
125    } else {
126      //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
127      uch = utext_next32(fText.getAlias());
128      //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
129    }
130    UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
131
132    int32_t bestPosn = -1;
133    int32_t bestValue = -1;
134
135    while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL  &&   // more to consume backwards and..
136          USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
137      if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
138        bestPosn = utext_getNativeIndex(fText.getAlias());
139        bestValue = fBackwardsTrie->getValue();
140      }
141      //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
142    }
143
144    if(USTRINGTRIE_MATCHES(r)) { // exact match?
145      //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
146      bestValue = fBackwardsTrie->getValue();
147      bestPosn = utext_getNativeIndex(fText.getAlias());
148      //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
149    }
150
151    if(bestPosn>=0) {
152      //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
153
154      //if(USTRINGTRIE_MATCHES(r)) {  // matched - so, now what?
155      //int32_t bestValue = fBackwardsTrie->getValue();
156      ////if(debug2) u_printf("rev< /%C/ matched, skip..%d  bestValue=%d\n", (UChar)uch, r, bestValue);
157
158      if(bestValue == kMATCH) { // exact match!
159        //if(debug2) u_printf(" exact backward match\n");
160        n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
161        if(n==UBRK_DONE) return n;
162        continue; // See if the next is another exception.
163      } else if(bestValue == kPARTIAL
164                && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
165        //if(debug2) u_printf(" partial backward match\n");
166        // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
167        // to see if it matches something going forward.
168        fForwardsPartialTrie->reset();
169        UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
170        utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
171        //if(debug2) u_printf("Retrying at %d\n", bestPosn);
172        while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
173              USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) {
174          //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
175        }
176        if(USTRINGTRIE_MATCHES(rfwd)) {
177          //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
178          // only full matches here, nothing to check
179          // skip the next:
180          n = fDelegate->next();
181          if(n==UBRK_DONE) return n;
182          continue;
183        } else {
184          //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
185          // no match (no exception) -return the 'underlying' break
186          return n;
187        }
188      } else {
189        return n; // internal error and/or no forwards trie
190      }
191    } else {
192      //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r);  // no best match
193      return n; // No match - so exit. Not an exception.
194    }
195  } while(n != UBRK_DONE);
196  return n;
197}
198
199U_NAMESPACE_END
200
201#if 0
202// Would improve performance - but, platform issues.
203// for the 'set'
204namespace std {
205  template <> struct hash<icu::UnicodeString> {
206    size_t operator()( const UnicodeString& str ) const {
207      return (size_t)str.hashCode();
208    }
209  };
210}
211#endif
212
213U_NAMESPACE_BEGIN
214
215class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
216public:
217  virtual ~SimpleFilteredBreakIteratorBuilder();
218  SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
219  SimpleFilteredBreakIteratorBuilder();
220  virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
221  virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
222  virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
223private:
224  set<UnicodeString> fSet;
225};
226
227SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
228{
229}
230
231SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
232  : fSet()
233{
234  if(U_SUCCESS(status)) {
235    LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status));
236    LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status));
237    LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status));
238    if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it.
239
240    LocalUResourceBundlePointer strs;
241    UErrorCode subStatus = status;
242    do {
243      strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
244      if(strs.isValid() && U_SUCCESS(subStatus)) {
245        UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
246        suppressBreakAfter(str, status); // load the string
247      }
248    } while (strs.isValid() && U_SUCCESS(subStatus));
249    if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
250      status = subStatus;
251    }
252  }
253}
254
255SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder()
256  : fSet()
257{
258}
259
260UBool
261SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
262{
263  if( U_FAILURE(status) ) return FALSE;
264  return fSet.insert(exception).second;
265}
266
267UBool
268SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
269{
270  if( U_FAILURE(status) ) return FALSE;
271  return ((fSet.erase(exception)) != 0);
272}
273
274BreakIterator *
275SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
276  LocalPointer<BreakIterator> adopt(adoptBreakIterator);
277
278  if(U_FAILURE(status)) {
279    return NULL;
280  }
281
282  LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status));
283  LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status));
284
285  int32_t revCount = 0;
286  int32_t fwdCount = 0;
287
288  int32_t subCount = fSet.size();
289  LocalArray<UnicodeString> ustrs(new UnicodeString[subCount]);
290  LocalArray<int> partials(new int[subCount]);
291
292  LocalPointer<UCharsTrie>    backwardsTrie; //  i.e. ".srM" for Mrs.
293  LocalPointer<UCharsTrie>    forwardsPartialTrie; //  Has ".a" for "a.M."
294
295  int n=0;
296  for ( set<UnicodeString>::iterator i = fSet.begin();
297        i != fSet.end();
298        i++) {
299    const UnicodeString &abbr = *i;
300    ustrs[n] = abbr;
301    partials[n] = 0; // default: not partial
302    n++;
303  }
304  // first pass - find partials.
305  for(int i=0;i<subCount;i++) {
306    int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
307    if(nn>-1 && (nn+1)!=ustrs[i].length()) {
308      //if(true) u_printf("Is a partial: /%S/\n", ustrs[i].getTerminatedBuffer());
309      // is partial.
310      // is it unique?
311      int sameAs = -1;
312      for(int j=0;j<subCount;j++) {
313        if(j==i) continue;
314        if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
315          //if(true) u_printf("Prefix match: /%S/ to %d\n", ustrs[j].getTerminatedBuffer(), nn+1);
316          //UBool otherIsPartial = ((nn+1)!=ustrs[j].length());  // true if ustrs[j] doesn't end at nn
317          if(partials[j]==0) { // hasn't been processed yet
318            partials[j] = kSuppressInReverse | kAddToForward;
319            //if(true) u_printf("Suppressing: /%S/\n", ustrs[j].getTerminatedBuffer());
320          } else if(partials[j] & kSuppressInReverse) {
321            sameAs = j; // the other entry is already in the reverse table.
322          }
323        }
324      }
325      //if(debug2) u_printf("for partial /%S/ same=%d partials=%d\n",      ustrs[i].getTerminatedBuffer(), sameAs, partials[i]);
326      UnicodeString prefix(ustrs[i], 0, nn+1);
327      if(sameAs == -1 && partials[i] == 0) {
328        // first one - add the prefix to the reverse table.
329        prefix.reverse();
330        builder->add(prefix, kPARTIAL, status);
331        revCount++;
332        //if(debug2) u_printf("Added Partial: /%S/ from /%S/ status=%s\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer(), u_errorName(status));
333        partials[i] = kSuppressInReverse | kAddToForward;
334      } else {
335        //if(debug2) u_printf(" // not adding partial for /%S/ from /%S/\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer());
336      }
337    }
338  }
339  for(int i=0;i<subCount;i++) {
340    if(partials[i]==0) {
341      ustrs[i].reverse();
342      builder->add(ustrs[i], kMATCH, status);
343      revCount++;
344      //if(debug2) u_printf("Added: /%S/ status=%s\n", ustrs[i].getTerminatedBuffer(), u_errorName(status));
345    } else {
346      //if(debug2) u_printf(" Adding fwd: /%S/\n", ustrs[i].getTerminatedBuffer());
347
348      // an optimization would be to only add the portion after the '.'
349      // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
350      // instead of "Ph.D." since we already know the "Ph." part is a match.
351      // would need the trie to be able to hold 0-length strings, though.
352      builder2->add(ustrs[i], kMATCH, status); // forward
353      fwdCount++;
354      //ustrs[i].reverse();
355      ////if(debug2) u_printf("SUPPRESS- not Added(%d):  /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
356    }
357  }
358  //if(debug) u_printf(" %s has %d abbrs.\n", fJSONSource.c_str(), subCount);
359
360  if(revCount>0) {
361    backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
362    if(U_FAILURE(status)) {
363      //printf("Error %s building backwards\n", u_errorName(status));
364      return NULL;
365    }
366  }
367
368  if(fwdCount>0) {
369    forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
370    if(U_FAILURE(status)) {
371      //printf("Error %s building forwards\n", u_errorName(status));
372      return NULL;
373    }
374  }
375
376  return new ULISentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
377}
378
379
380// -----------
381
382FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
383}
384
385FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
386}
387
388FilteredBreakIteratorBuilder *
389FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
390  if(U_FAILURE(status)) return NULL;
391
392  LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status));
393  if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
394  return ret.orphan();
395}
396
397FilteredBreakIteratorBuilder *
398FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
399  if(U_FAILURE(status)) return NULL;
400
401  LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder());
402  if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
403  return ret.orphan();
404}
405
406U_NAMESPACE_END
407
408#endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION
409