1/* 2********************************************************************** 3* Copyright (c) 2001-2004, International Business Machines Corporation 4* and others. All Rights Reserved. 5********************************************************************** 6* Date Name Description 7* 07/23/01 aliu Creation. 8********************************************************************** 9*/ 10 11#include "unicode/utypes.h" 12 13#if !UCONFIG_NO_TRANSLITERATION 14 15#include "strmatch.h" 16#include "rbt_data.h" 17#include "util.h" 18#include "unicode/uniset.h" 19 20U_NAMESPACE_BEGIN 21 22static const UChar EMPTY[] = { 0 }; // empty string: "" 23 24UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher) 25 26StringMatcher::StringMatcher(const UnicodeString& theString, 27 int32_t start, 28 int32_t limit, 29 int32_t segmentNum, 30 const TransliterationRuleData& theData) : 31 data(&theData), 32 segmentNumber(segmentNum), 33 matchStart(-1), 34 matchLimit(-1) 35{ 36 theString.extractBetween(start, limit, pattern); 37} 38 39StringMatcher::StringMatcher(const StringMatcher& o) : 40 UnicodeFunctor(o), 41 UnicodeMatcher(o), 42 UnicodeReplacer(o), 43 pattern(o.pattern), 44 data(o.data), 45 segmentNumber(o.segmentNumber), 46 matchStart(o.matchStart), 47 matchLimit(o.matchLimit) 48{ 49} 50 51/** 52 * Destructor 53 */ 54StringMatcher::~StringMatcher() { 55} 56 57/** 58 * Implement UnicodeFunctor 59 */ 60UnicodeFunctor* StringMatcher::clone() const { 61 return new StringMatcher(*this); 62} 63 64/** 65 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 66 * and return the pointer. 67 */ 68UnicodeMatcher* StringMatcher::toMatcher() const { 69 return (UnicodeMatcher*) this; 70} 71 72/** 73 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 74 * and return the pointer. 75 */ 76UnicodeReplacer* StringMatcher::toReplacer() const { 77 return (UnicodeReplacer*) this; 78} 79 80/** 81 * Implement UnicodeMatcher 82 */ 83UMatchDegree StringMatcher::matches(const Replaceable& text, 84 int32_t& offset, 85 int32_t limit, 86 UBool incremental) { 87 int32_t i; 88 int32_t cursor = offset; 89 if (limit < cursor) { 90 // Match in the reverse direction 91 for (i=pattern.length()-1; i>=0; --i) { 92 UChar keyChar = pattern.charAt(i); 93 UnicodeMatcher* subm = data->lookupMatcher(keyChar); 94 if (subm == 0) { 95 if (cursor > limit && 96 keyChar == text.charAt(cursor)) { 97 --cursor; 98 } else { 99 return U_MISMATCH; 100 } 101 } else { 102 UMatchDegree m = 103 subm->matches(text, cursor, limit, incremental); 104 if (m != U_MATCH) { 105 return m; 106 } 107 } 108 } 109 // Record the match position, but adjust for a normal 110 // forward start, limit, and only if a prior match does not 111 // exist -- we want the rightmost match. 112 if (matchStart < 0) { 113 matchStart = cursor+1; 114 matchLimit = offset+1; 115 } 116 } else { 117 for (i=0; i<pattern.length(); ++i) { 118 if (incremental && cursor == limit) { 119 // We've reached the context limit without a mismatch and 120 // without completing our match. 121 return U_PARTIAL_MATCH; 122 } 123 UChar keyChar = pattern.charAt(i); 124 UnicodeMatcher* subm = data->lookupMatcher(keyChar); 125 if (subm == 0) { 126 // Don't need the cursor < limit check if 127 // incremental is TRUE (because it's done above); do need 128 // it otherwise. 129 if (cursor < limit && 130 keyChar == text.charAt(cursor)) { 131 ++cursor; 132 } else { 133 return U_MISMATCH; 134 } 135 } else { 136 UMatchDegree m = 137 subm->matches(text, cursor, limit, incremental); 138 if (m != U_MATCH) { 139 return m; 140 } 141 } 142 } 143 // Record the match position 144 matchStart = offset; 145 matchLimit = cursor; 146 } 147 148 offset = cursor; 149 return U_MATCH; 150} 151 152/** 153 * Implement UnicodeMatcher 154 */ 155UnicodeString& StringMatcher::toPattern(UnicodeString& result, 156 UBool escapeUnprintable) const 157{ 158 result.truncate(0); 159 UnicodeString str, quoteBuf; 160 if (segmentNumber > 0) { 161 result.append((UChar)40); /*(*/ 162 } 163 for (int32_t i=0; i<pattern.length(); ++i) { 164 UChar keyChar = pattern.charAt(i); 165 const UnicodeMatcher* m = data->lookupMatcher(keyChar); 166 if (m == 0) { 167 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf); 168 } else { 169 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable), 170 TRUE, escapeUnprintable, quoteBuf); 171 } 172 } 173 if (segmentNumber > 0) { 174 result.append((UChar)41); /*)*/ 175 } 176 // Flush quoteBuf out to result 177 ICU_Utility::appendToRule(result, -1, 178 TRUE, escapeUnprintable, quoteBuf); 179 return result; 180} 181 182/** 183 * Implement UnicodeMatcher 184 */ 185UBool StringMatcher::matchesIndexValue(uint8_t v) const { 186 if (pattern.length() == 0) { 187 return TRUE; 188 } 189 UChar32 c = pattern.char32At(0); 190 const UnicodeMatcher *m = data->lookupMatcher(c); 191 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); 192} 193 194/** 195 * Implement UnicodeMatcher 196 */ 197void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const { 198 UChar32 ch; 199 for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) { 200 ch = pattern.char32At(i); 201 const UnicodeMatcher* matcher = data->lookupMatcher(ch); 202 if (matcher == NULL) { 203 toUnionTo.add(ch); 204 } else { 205 matcher->addMatchSetTo(toUnionTo); 206 } 207 } 208} 209 210/** 211 * UnicodeReplacer API 212 */ 213int32_t StringMatcher::replace(Replaceable& text, 214 int32_t start, 215 int32_t limit, 216 int32_t& /*cursor*/) { 217 218 int32_t outLen = 0; 219 220 // Copy segment with out-of-band data 221 int32_t dest = limit; 222 // If there was no match, that means that a quantifier 223 // matched zero-length. E.g., x (a)* y matched "xy". 224 if (matchStart >= 0) { 225 if (matchStart != matchLimit) { 226 text.copy(matchStart, matchLimit, dest); 227 outLen = matchLimit - matchStart; 228 } 229 } 230 231 text.handleReplaceBetween(start, limit, EMPTY); // delete original text 232 233 return outLen; 234} 235 236/** 237 * UnicodeReplacer API 238 */ 239UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule, 240 UBool /*escapeUnprintable*/) const { 241 // assert(segmentNumber > 0); 242 rule.truncate(0); 243 rule.append((UChar)0x0024 /*$*/); 244 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1); 245 return rule; 246} 247 248/** 249 * Remove any match info. This must be called before performing a 250 * set of matches with this segment. 251 */ 252 void StringMatcher::resetMatch() { 253 matchStart = matchLimit = -1; 254} 255 256/** 257 * Union the set of all characters that may output by this object 258 * into the given set. 259 * @param toUnionTo the set into which to union the output characters 260 */ 261void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const { 262 // The output of this replacer varies; it is the source text between 263 // matchStart and matchLimit. Since this varies depending on the 264 // input text, we can't compute it here. We can either do nothing 265 // or we can add ALL characters to the set. It's probably more useful 266 // to do nothing. 267} 268 269/** 270 * Implement UnicodeFunctor 271 */ 272void StringMatcher::setData(const TransliterationRuleData* d) { 273 data = d; 274 int32_t i = 0; 275 while (i<pattern.length()) { 276 UChar32 c = pattern.char32At(i); 277 UnicodeFunctor* f = data->lookup(c); 278 if (f != NULL) { 279 f->setData(data); 280 } 281 i += UTF_CHAR_LENGTH(c); 282 } 283} 284 285U_NAMESPACE_END 286 287#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 288 289//eof 290