1c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/* 2c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru********************************************************************** 327f654740f2a26ad62a5c155af9199af9e69b889claireho* Copyright (C) 2008-2010, International Business Machines 4c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 5c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru********************************************************************** 6c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* Date Name Description 7c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru* 05/11/2008 Andy Heninger Port from Java 8c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru********************************************************************** 9c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru*/ 10c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 11c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/utypes.h" 12c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 13c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION 14c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 15c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/unifilt.h" 16c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/uchar.h" 17c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/uniset.h" 18c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/brkiter.h" 19c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "brktrans.h" 20c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "unicode/uchar.h" 21c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "cmemory.h" 22c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uprops.h" 23c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uinvchar.h" 24c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "util.h" 25c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uvectr32.h" 26c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 27c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_BEGIN 28c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 29c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) 30c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 31c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar SPACE = 32; // ' ' 32c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 33c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 34c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 35c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Constructs a transliterator with the default delimiters '{' and 36c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * '}'. 37c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 38c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruBreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : 39b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), 40b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru fInsertion(SPACE) { 41c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru bi = NULL; 42c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 43c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru boundaries = new UVector32(status); 44c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 45c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 46c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 47c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 48c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Destructor. 49c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 50c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruBreakTransliterator::~BreakTransliterator() { 51c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete bi; 52c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru bi = NULL; 53c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete boundaries; 54c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru boundaries = NULL; 55c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 56c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 57c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 58c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Copy constructor. 59c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 60c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruBreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : 61c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru Transliterator(o) { 62c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru bi = NULL; 63c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (o.bi != NULL) { 64c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru bi = o.bi->clone(); 65c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 66c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fInsertion = o.fInsertion; 67c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 68c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru boundaries = new UVector32(status); 69c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 70c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 71c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 72c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 73c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Transliterator API. 74c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 75c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruTransliterator* BreakTransliterator::clone(void) const { 76c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return new BreakTransliterator(*this); 77c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 78c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 79c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru/** 80c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru * Implements {@link Transliterator#handleTransliterate}. 81c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru */ 82c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruvoid BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 83c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool isIncremental ) const { 84c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 85c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 86c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru boundaries->removeAllElements(); 87c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru BreakTransliterator *nonConstThis = (BreakTransliterator *)this; 88c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nonConstThis->getBreakIterator(); // Lazy-create it if necessary 89c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString sText = replaceableAsString(text); 90c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru bi->setText(sText); 91c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru bi->preceding(offsets.start); 92c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 93c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // To make things much easier, we will stack the boundaries, and then insert at the end. 94c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // generally, we won't need too many, since we will be filtered. 95c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 96c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t boundary; 97c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { 98c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (boundary == 0) continue; 99c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // HACK: Check to see that preceeding item was a letter 100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 101c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 cp = sText.char32At(boundary-1); 102c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int type = u_charType(cp); 103c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru //System.out.println(Integer.toString(cp,16) + " (before): " + type); 104c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 105c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 106c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru cp = sText.char32At(boundary); 107c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru type = u_charType(cp); 108c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru //System.out.println(Integer.toString(cp,16) + " (after): " + type); 109c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 110c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 111c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru boundaries->addElement(boundary, status); 112b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // printf("Boundary at %d\n", boundary); 113c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 114c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 115c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int delta = 0; 116c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int lastBoundary = 0; 117c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 118c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (boundaries->size() != 0) { // if we found something, adjust 119c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delta = boundaries->size() * fInsertion.length(); 120c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru lastBoundary = boundaries->lastElementi(); 121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 122c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // we do this from the end backwards, so that we don't have to keep updating. 123c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 124c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while (boundaries->size() > 0) { 125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru boundary = boundaries->popi(); 126c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru text.handleReplaceBetween(boundary, boundary, fInsertion); 127c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 128c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 129c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 130c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Now fix up the return values 131c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru offsets.contextLimit += delta; 132c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru offsets.limit += delta; 133c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; 134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 135c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: do something with U_FAILURE(status); 136c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // (need to look at transliterators overall, not just here.) 137c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 139c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 140c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// getInsertion() 141c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruconst UnicodeString &BreakTransliterator::getInsertion() const { 143c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return fInsertion; 144c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 145c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 146c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 147c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// setInsertion() 148c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 14950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid BreakTransliterator::setInsertion(const UnicodeString &insertion) { 150c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru this->fInsertion = insertion; 151c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 152c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 153c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 154c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// getBreakIterator Lazily create the break iterator if it does 155c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// not already exist. Copied from Java, probably 156c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// better to just create it in the constructor. 157c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 158c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruBreakIterator *BreakTransliterator::getBreakIterator() { 159c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 160c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (bi == NULL) { 161c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Note: Thai breaking behavior is universal, it is not 162c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // tied to the Thai locale. 163c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru bi = BreakIterator::createWordInstance(Locale::getEnglish(), status); 164c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 165c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return bi; 166c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 167c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 168c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 169c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// replaceableAsString Hack to let break iterators work 170c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// on the replaceable text from transliterators. 171c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// In practice, the only real Replaceable type that we 172c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// will be seeing is UnicodeString, so this function 173c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// will normally be efficient. 174c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 175c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { 176c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString s; 17727f654740f2a26ad62a5c155af9199af9e69b889claireho UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); 17827f654740f2a26ad62a5c155af9199af9e69b889claireho if (rs != NULL) { 17927f654740f2a26ad62a5c155af9199af9e69b889claireho s = *rs; 18027f654740f2a26ad62a5c155af9199af9e69b889claireho } else { 18127f654740f2a26ad62a5c155af9199af9e69b889claireho r.extractBetween(0, r.length(), s); 18227f654740f2a26ad62a5c155af9199af9e69b889claireho } 183c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return s; 184c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 185c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 186c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruU_NAMESPACE_END 187c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 188c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 189