/*
**********************************************************************
*   Copyright (C) 2008-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   05/11/2008  Andy Heninger  Port from Java
**********************************************************************
*/
10c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
11c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/utypes.h"
12c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
13c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
14c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
15c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/unifilt.h"
16c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/uchar.h"
17c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/uniset.h"
18c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/brkiter.h"
19c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "brktrans.h"
20c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/uchar.h"
21c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "cmemory.h"
22c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uprops.h"
23c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uinvchar.h"
24c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "util.h"
25c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uvectr32.h"
26c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
27c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_BEGIN
28c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
29c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
30c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
31c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar SPACE       = 32;  // ' '
32c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
33c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
34c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/**
35c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Constructs a transliterator with the default delimiters '{' and
36c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * '}'.
37c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */
38c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruBreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
39b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
40b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    fInsertion(SPACE) {
41c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        bi = NULL;
42c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        UErrorCode status = U_ZERO_ERROR;
43c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        boundaries = new UVector32(status);
44c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    }
45c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
46c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
47c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/**
48c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Destructor.
49c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */
50c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruBreakTransliterator::~BreakTransliterator() {
51c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    delete bi;
52c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    bi = NULL;
53c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    delete boundaries;
54c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    boundaries = NULL;
55c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}
56c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
57c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/**
58c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Copy constructor.
59c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */
60c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruBreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
61c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    Transliterator(o) {
62c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        bi = NULL;
63c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        if (o.bi != NULL) {
64c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            bi = o.bi->clone();
65c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        }
66c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        fInsertion = o.fInsertion;
67c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        UErrorCode status = U_ZERO_ERROR;
68c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        boundaries = new UVector32(status);
69c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    }
70c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
71c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
72c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/**
73c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Transliterator API.
74c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */
75c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruTransliterator* BreakTransliterator::clone(void) const {
76c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    return new BreakTransliterator(*this);
77c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}
78c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
79c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/**
80c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Implements {@link Transliterator#handleTransliterate}.
81c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */
82c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruvoid BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
83c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru                                                    UBool isIncremental ) const {
84c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
85c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        UErrorCode status = U_ZERO_ERROR;
86c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        boundaries->removeAllElements();
87c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        BreakTransliterator *nonConstThis = (BreakTransliterator *)this;
88c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        nonConstThis->getBreakIterator(); // Lazy-create it if necessary
89c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        UnicodeString sText = replaceableAsString(text);
90c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        bi->setText(sText);
91c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        bi->preceding(offsets.start);
92c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
93c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        // To make things much easier, we will stack the boundaries, and then insert at the end.
94c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        // generally, we won't need too many, since we will be filtered.
95c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
96c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        int32_t boundary;
97c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
98c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            if (boundary == 0) continue;
99c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            // HACK: Check to see that preceeding item was a letter
100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
101c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            UChar32 cp = sText.char32At(boundary-1);
102c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            int type = u_charType(cp);
103c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            //System.out.println(Integer.toString(cp,16) + " (before): " + type);
104c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
105c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
106c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            cp = sText.char32At(boundary);
107c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            type = u_charType(cp);
108c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            //System.out.println(Integer.toString(cp,16) + " (after): " + type);
109c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
110c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
111c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            boundaries->addElement(boundary, status);
112b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru            // printf("Boundary at %d\n", boundary);
113c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        }
114c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
115c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        int delta = 0;
116c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        int lastBoundary = 0;
117c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
118c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        if (boundaries->size() != 0) { // if we found something, adjust
119c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            delta = boundaries->size() * fInsertion.length();
120c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            lastBoundary = boundaries->lastElementi();
121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
122c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            // we do this from the end backwards, so that we don't have to keep updating.
123c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
124c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            while (boundaries->size() > 0) {
125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru                boundary = boundaries->popi();
126c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru                text.handleReplaceBetween(boundary, boundary, fInsertion);
127c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru            }
128c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        }
129c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
130c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        // Now fix up the return values
131c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        offsets.contextLimit += delta;
132c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        offsets.limit += delta;
133c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
135c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        // TODO:  do something with U_FAILURE(status);
136c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        //        (need to look at transliterators overall, not just here.)
137c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}
138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
139c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//
140c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//  getInsertion()
141c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//
142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruconst UnicodeString &BreakTransliterator::getInsertion() const {
143c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    return fInsertion;
144c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}
145c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
146c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//
147c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//  setInsertion()
148c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//
14950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid BreakTransliterator::setInsertion(const UnicodeString &insertion) {
150c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    this->fInsertion = insertion;
151c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}
152c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
153c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//
154c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//  getBreakIterator     Lazily create the break iterator if it does
155c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//                       not already exist.  Copied from Java, probably
156c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//                       better to just create it in the constructor.
157c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//
158c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruBreakIterator *BreakTransliterator::getBreakIterator() {
159c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    UErrorCode status = U_ZERO_ERROR;
160c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    if (bi == NULL) {
161c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        // Note:  Thai breaking behavior is universal, it is not
162c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        //        tied to the Thai locale.
163c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru        bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
164c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    }
165c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    return bi;
166c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}
167c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
168c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//
169c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//   replaceableAsString   Hack to let break iterators work
170c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//                         on the replaceable text from transliterators.
171c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//                         In practice, the only real Replaceable type that we
172c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//                         will be seeing is UnicodeString, so this function
173c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//                         will normally be efficient.
174c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//
175c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
176c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    UnicodeString s;
17727f654740f2a26ad62a5c155af9199af9e69b889claireho    UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
17827f654740f2a26ad62a5c155af9199af9e69b889claireho    if (rs != NULL) {
17927f654740f2a26ad62a5c155af9199af9e69b889claireho        s = *rs;
18027f654740f2a26ad62a5c155af9199af9e69b889claireho    } else {
18127f654740f2a26ad62a5c155af9199af9e69b889claireho        r.extractBetween(0, r.length(), s);
18227f654740f2a26ad62a5c155af9199af9e69b889claireho    }
183c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru    return s;
184c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru}
185c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
186c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_END
187c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
189