1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4********************************************************************** 5* Copyright (C) 1999-2015, International Business Machines 6* Corporation and others. All Rights Reserved. 7********************************************************************** 8* Date Name Description 9* 11/17/99 aliu Creation. 10********************************************************************** 11*/ 12 13#include "unicode/utypes.h" 14 15#if !UCONFIG_NO_TRANSLITERATION 16 17#include "unicode/rep.h" 18#include "unicode/uniset.h" 19#include "rbt_pars.h" 20#include "rbt_data.h" 21#include "rbt_rule.h" 22#include "rbt.h" 23#include "mutex.h" 24#include "umutex.h" 25 26U_NAMESPACE_BEGIN 27 28UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator) 29 30static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER; 31static Replaceable *gLockedText = NULL; 32 33void RuleBasedTransliterator::_construct(const UnicodeString& rules, 34 UTransDirection direction, 35 UParseError& parseError, 36 UErrorCode& status) { 37 fData = 0; 38 isDataOwned = TRUE; 39 if (U_FAILURE(status)) { 40 return; 41 } 42 43 TransliteratorParser parser(status); 44 parser.parse(rules, direction, parseError, status); 45 if (U_FAILURE(status)) { 46 return; 47 } 48 49 if (parser.idBlockVector.size() != 0 || 50 parser.compoundFilter != NULL || 51 parser.dataVector.size() == 0) { 52 status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT 53 return; 54 } 55 56 fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); 57 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 58} 59 60/** 61 * Constructs a new transliterator from the given rules. 62 * @param id the id for the transliterator. 63 * @param rules rules, separated by ';' 64 * @param direction either FORWARD or REVERSE. 65 * @param adoptedFilter the filter for this transliterator. 66 * @param parseError Struct to recieve information on position 67 * of error if an error is encountered 68 * @param status Output param set to success/failure code. 69 * @exception IllegalArgumentException if rules are malformed 70 * or direction is invalid. 71 */ 72RuleBasedTransliterator::RuleBasedTransliterator( 73 const UnicodeString& id, 74 const UnicodeString& rules, 75 UTransDirection direction, 76 UnicodeFilter* adoptedFilter, 77 UParseError& parseError, 78 UErrorCode& status) : 79 Transliterator(id, adoptedFilter) { 80 _construct(rules, direction,parseError,status); 81} 82 83/** 84 * Constructs a new transliterator from the given rules. 85 * @param id the id for the transliterator. 86 * @param rules rules, separated by ';' 87 * @param direction either FORWARD or REVERSE. 88 * @param adoptedFilter the filter for this transliterator. 89 * @param status Output param set to success/failure code. 90 * @exception IllegalArgumentException if rules are malformed 91 * or direction is invalid. 92 */ 93/*RuleBasedTransliterator::RuleBasedTransliterator( 94 const UnicodeString& id, 95 const UnicodeString& rules, 96 UTransDirection direction, 97 UnicodeFilter* adoptedFilter, 98 UErrorCode& status) : 99 Transliterator(id, adoptedFilter) { 100 UParseError parseError; 101 _construct(rules, direction,parseError, status); 102}*/ 103 104/** 105 * Covenience constructor with no filter. 106 */ 107/*RuleBasedTransliterator::RuleBasedTransliterator( 108 const UnicodeString& id, 109 const UnicodeString& rules, 110 UTransDirection direction, 111 UErrorCode& status) : 112 Transliterator(id, 0) { 113 UParseError parseError; 114 _construct(rules, direction,parseError, status); 115}*/ 116 117/** 118 * Covenience constructor with no filter and FORWARD direction. 119 */ 120/*RuleBasedTransliterator::RuleBasedTransliterator( 121 const UnicodeString& id, 122 const UnicodeString& rules, 123 UErrorCode& status) : 124 Transliterator(id, 0) { 125 UParseError parseError; 126 _construct(rules, UTRANS_FORWARD, parseError, status); 127}*/ 128 129/** 130 * Covenience constructor with FORWARD direction. 131 */ 132/*RuleBasedTransliterator::RuleBasedTransliterator( 133 const UnicodeString& id, 134 const UnicodeString& rules, 135 UnicodeFilter* adoptedFilter, 136 UErrorCode& status) : 137 Transliterator(id, adoptedFilter) { 138 UParseError parseError; 139 _construct(rules, UTRANS_FORWARD,parseError, status); 140}*/ 141 142RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, 143 const TransliterationRuleData* theData, 144 UnicodeFilter* adoptedFilter) : 145 Transliterator(id, adoptedFilter), 146 fData((TransliterationRuleData*)theData), // cast away const 147 isDataOwned(FALSE) { 148 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 149} 150 151/** 152 * Internal constructor. 153 */ 154RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id, 155 TransliterationRuleData* theData, 156 UBool isDataAdopted) : 157 Transliterator(id, 0), 158 fData(theData), 159 isDataOwned(isDataAdopted) { 160 setMaximumContextLength(fData->ruleSet.getMaximumContextLength()); 161} 162 163/** 164 * Copy constructor. 165 */ 166RuleBasedTransliterator::RuleBasedTransliterator( 167 const RuleBasedTransliterator& other) : 168 Transliterator(other), fData(other.fData), 169 isDataOwned(other.isDataOwned) { 170 171 // The data object may or may not be owned. If it is not owned we 172 // share it; it is invariant. If it is owned, it's still 173 // invariant, but we need to copy it to prevent double-deletion. 174 // If this becomes a performance issue (if people do a lot of RBT 175 // copying -- unlikely) we can reference count the data object. 176 177 // Only do a deep copy if this is owned data, that is, data that 178 // will be later deleted. System transliterators contain 179 // non-owned data. 180 if (isDataOwned) { 181 fData = new TransliterationRuleData(*other.fData); 182 } 183} 184 185/** 186 * Destructor. 187 */ 188RuleBasedTransliterator::~RuleBasedTransliterator() { 189 // Delete the data object only if we own it. 190 if (isDataOwned) { 191 delete fData; 192 } 193} 194 195Transliterator* // Covariant return NOT ALLOWED (for portability) 196RuleBasedTransliterator::clone(void) const { 197 return new RuleBasedTransliterator(*this); 198} 199 200/** 201 * Implements {@link Transliterator#handleTransliterate}. 202 */ 203void 204RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, 205 UBool isIncremental) const { 206 /* We keep contextStart and contextLimit fixed the entire time, 207 * relative to the text -- contextLimit may move numerically if 208 * text is inserted or removed. The start offset moves toward 209 * limit, with replacements happening under it. 210 * 211 * Example: rules 1. ab>x|y 212 * 2. yc>z 213 * 214 * |eabcd begin - no match, advance start 215 * e|abcd match rule 1 - change text & adjust start 216 * ex|ycd match rule 2 - change text & adjust start 217 * exz|d no match, advance start 218 * exzd| done 219 */ 220 221 /* A rule like 222 * a>b|a 223 * creates an infinite loop. To prevent that, we put an arbitrary 224 * limit on the number of iterations that we take, one that is 225 * high enough that any reasonable rules are ok, but low enough to 226 * prevent a server from hanging. The limit is 16 times the 227 * number of characters n, unless n is so large that 16n exceeds a 228 * uint32_t. 229 */ 230 uint32_t loopCount = 0; 231 uint32_t loopLimit = index.limit - index.start; 232 if (loopLimit >= 0x10000000) { 233 loopLimit = 0xFFFFFFFF; 234 } else { 235 loopLimit <<= 4; 236 } 237 238 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent 239 // operations must be prevented. 240 // A Complication: compound transliterators can result in recursive entries to this 241 // function, sometimes with different "This" objects, always with the same text. 242 // Double-locking must be prevented in these cases. 243 // 244 245 UBool lockedMutexAtThisLevel = FALSE; 246 247 // Test whether this request is operating on the same text string as 248 // some other transliteration that is still in progress and holding the 249 // transliteration mutex. If so, do not lock the transliteration 250 // mutex again. 251 // 252 // gLockedText variable is protected by the global ICU mutex. 253 // Shared RBT data protected by transliteratorDataMutex. 254 // 255 // TODO(andy): Need a better scheme for handling this. 256 UBool needToLock; 257 { 258 Mutex m; 259 needToLock = (&text != gLockedText); 260 } 261 if (needToLock) { 262 umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here. 263 Mutex m; 264 gLockedText = &text; 265 lockedMutexAtThisLevel = TRUE; 266 } 267 268 // Check to make sure we don't dereference a null pointer. 269 if (fData != NULL) { 270 while (index.start < index.limit && 271 loopCount <= loopLimit && 272 fData->ruleSet.transliterate(text, index, isIncremental)) { 273 ++loopCount; 274 } 275 } 276 if (lockedMutexAtThisLevel) { 277 { 278 Mutex m; 279 gLockedText = NULL; 280 } 281 umtx_unlock(&transliteratorDataMutex); 282 } 283} 284 285UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource, 286 UBool escapeUnprintable) const { 287 return fData->ruleSet.toRules(rulesSource, escapeUnprintable); 288} 289 290/** 291 * Implement Transliterator framework 292 */ 293void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const { 294 fData->ruleSet.getSourceTargetSet(result, FALSE); 295} 296 297/** 298 * Override Transliterator framework 299 */ 300UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const { 301 return fData->ruleSet.getSourceTargetSet(result, TRUE); 302} 303 304U_NAMESPACE_END 305 306#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 307