1/* 2 ********************************************************************** 3 * Copyright (C) 1999-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Date Name Description 7 * 11/17/99 aliu Creation. 8 ********************************************************************** 9 */ 10 11#include "unicode/utypes.h" 12 13#if !UCONFIG_NO_TRANSLITERATION 14 15#include "unicode/uobject.h" 16#include "unicode/parseerr.h" 17#include "unicode/parsepos.h" 18#include "unicode/putil.h" 19#include "unicode/uchar.h" 20#include "unicode/ustring.h" 21#include "unicode/uniset.h" 22#include "unicode/utf16.h" 23#include "cstring.h" 24#include "funcrepl.h" 25#include "hash.h" 26#include "quant.h" 27#include "rbt.h" 28#include "rbt_data.h" 29#include "rbt_pars.h" 30#include "rbt_rule.h" 31#include "strmatch.h" 32#include "strrepl.h" 33#include "unicode/symtable.h" 34#include "tridpars.h" 35#include "uvector.h" 36#include "hash.h" 37#include "patternprops.h" 38#include "util.h" 39#include "cmemory.h" 40#include "uprops.h" 41#include "putilimp.h" 42 43// Operators 44#define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/ 45#define FORWARD_RULE_OP ((UChar)0x003E) /*>*/ 46#define REVERSE_RULE_OP ((UChar)0x003C) /*<*/ 47#define FWDREV_RULE_OP ((UChar)0x007E) /*~*/ // internal rep of <> op 48 49// Other special characters 50#define QUOTE ((UChar)0x0027) /*'*/ 51#define ESCAPE ((UChar)0x005C) /*\*/ 52#define END_OF_RULE ((UChar)0x003B) /*;*/ 53#define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/ 54 55#define SEGMENT_OPEN ((UChar)0x0028) /*(*/ 56#define SEGMENT_CLOSE ((UChar)0x0029) /*)*/ 57#define CONTEXT_ANTE ((UChar)0x007B) /*{*/ 58#define CONTEXT_POST ((UChar)0x007D) /*}*/ 59#define CURSOR_POS ((UChar)0x007C) /*|*/ 60#define CURSOR_OFFSET ((UChar)0x0040) /*@*/ 61#define ANCHOR_START ((UChar)0x005E) /*^*/ 62#define KLEENE_STAR ((UChar)0x002A) /***/ 63#define ONE_OR_MORE ((UChar)0x002B) /*+*/ 64#define ZERO_OR_ONE ((UChar)0x003F) /*?*/ 65 66#define DOT ((UChar)46) /*.*/ 67 68static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]"; 69 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90, 70 108, 58, 93, 92, 114, 92, 110, 36, 93, 0 71}; 72 73// A function is denoted &Source-Target/Variant(text) 74#define FUNCTION ((UChar)38) /*&*/ 75 76// Aliases for some of the syntax characters. These are provided so 77// transliteration rules can be expressed in XML without clashing with 78// XML syntax characters '<', '>', and '&'. 79#define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow 80#define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow 81#define ALT_FWDREV_RULE_OP ((UChar)0x2194) // Left Right Arrow 82#define ALT_FUNCTION ((UChar)0x2206) // Increment (~Greek Capital Delta) 83 84// Special characters disallowed at the top level 85static const UChar ILLEGAL_TOP[] = {41,0}; // ")" 86 87// Special characters disallowed within a segment 88static const UChar ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}|@" 89 90// Special characters disallowed within a function argument 91static const UChar ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@" 92 93// By definition, the ANCHOR_END special character is a 94// trailing SymbolTable.SYMBOL_REF character. 95// private static final char ANCHOR_END = '$'; 96 97static const UChar gOPERATORS[] = { // "=><" 98 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP, 99 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP, 100 0 101}; 102 103static const UChar HALF_ENDERS[] = { // "=><;" 104 VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP, 105 ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP, 106 END_OF_RULE, 107 0 108}; 109 110// These are also used in Transliterator::toRules() 111static const int32_t ID_TOKEN_LEN = 2; 112static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':' 113 114/* 115commented out until we do real ::BEGIN/::END functionality 116static const int32_t BEGIN_TOKEN_LEN = 5; 117static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN' 118 119static const int32_t END_TOKEN_LEN = 3; 120static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END' 121*/ 122 123U_NAMESPACE_BEGIN 124 125//---------------------------------------------------------------------- 126// BEGIN ParseData 127//---------------------------------------------------------------------- 128 129/** 130 * This class implements the SymbolTable interface. It is used 131 * during parsing to give UnicodeSet access to variables that 132 * have been defined so far. Note that it uses variablesVector, 133 * _not_ data.setVariables. 134 */ 135class ParseData : public UMemory, public SymbolTable { 136public: 137 const TransliterationRuleData* data; // alias 138 139 const UVector* variablesVector; // alias 140 141 const Hashtable* variableNames; // alias 142 143 ParseData(const TransliterationRuleData* data = 0, 144 const UVector* variablesVector = 0, 145 const Hashtable* variableNames = 0); 146 147 virtual ~ParseData(); 148 149 virtual const UnicodeString* lookup(const UnicodeString& s) const; 150 151 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 152 153 virtual UnicodeString parseReference(const UnicodeString& text, 154 ParsePosition& pos, int32_t limit) const; 155 /** 156 * Return true if the given character is a matcher standin or a plain 157 * character (non standin). 158 */ 159 UBool isMatcher(UChar32 ch); 160 161 /** 162 * Return true if the given character is a replacer standin or a plain 163 * character (non standin). 164 */ 165 UBool isReplacer(UChar32 ch); 166 167private: 168 ParseData(const ParseData &other); // forbid copying of this class 169 ParseData &operator=(const ParseData &other); // forbid copying of this class 170}; 171 172ParseData::ParseData(const TransliterationRuleData* d, 173 const UVector* sets, 174 const Hashtable* vNames) : 175 data(d), variablesVector(sets), variableNames(vNames) {} 176 177ParseData::~ParseData() {} 178 179/** 180 * Implement SymbolTable API. 181 */ 182const UnicodeString* ParseData::lookup(const UnicodeString& name) const { 183 return (const UnicodeString*) variableNames->get(name); 184} 185 186/** 187 * Implement SymbolTable API. 188 */ 189const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const { 190 // Note that we cannot use data.lookupSet() because the 191 // set array has not been constructed yet. 192 const UnicodeFunctor* set = NULL; 193 int32_t i = ch - data->variablesBase; 194 if (i >= 0 && i < variablesVector->size()) { 195 int32_t i = ch - data->variablesBase; 196 set = (i < variablesVector->size()) ? 197 (UnicodeFunctor*) variablesVector->elementAt(i) : 0; 198 } 199 return set; 200} 201 202/** 203 * Implement SymbolTable API. Parse out a symbol reference 204 * name. 205 */ 206UnicodeString ParseData::parseReference(const UnicodeString& text, 207 ParsePosition& pos, int32_t limit) const { 208 int32_t start = pos.getIndex(); 209 int32_t i = start; 210 UnicodeString result; 211 while (i < limit) { 212 UChar c = text.charAt(i); 213 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 214 break; 215 } 216 ++i; 217 } 218 if (i == start) { // No valid name chars 219 return result; // Indicate failure with empty string 220 } 221 pos.setIndex(i); 222 text.extractBetween(start, i, result); 223 return result; 224} 225 226UBool ParseData::isMatcher(UChar32 ch) { 227 // Note that we cannot use data.lookup() because the 228 // set array has not been constructed yet. 229 int32_t i = ch - data->variablesBase; 230 if (i >= 0 && i < variablesVector->size()) { 231 UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); 232 return f != NULL && f->toMatcher() != NULL; 233 } 234 return TRUE; 235} 236 237/** 238 * Return true if the given character is a replacer standin or a plain 239 * character (non standin). 240 */ 241UBool ParseData::isReplacer(UChar32 ch) { 242 // Note that we cannot use data.lookup() because the 243 // set array has not been constructed yet. 244 int i = ch - data->variablesBase; 245 if (i >= 0 && i < variablesVector->size()) { 246 UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); 247 return f != NULL && f->toReplacer() != NULL; 248 } 249 return TRUE; 250} 251 252//---------------------------------------------------------------------- 253// BEGIN RuleHalf 254//---------------------------------------------------------------------- 255 256/** 257 * A class representing one side of a rule. This class knows how to 258 * parse half of a rule. It is tightly coupled to the method 259 * RuleBasedTransliterator.Parser.parseRule(). 260 */ 261class RuleHalf : public UMemory { 262 263public: 264 265 UnicodeString text; 266 267 int32_t cursor; // position of cursor in text 268 int32_t ante; // position of ante context marker '{' in text 269 int32_t post; // position of post context marker '}' in text 270 271 // Record the offset to the cursor either to the left or to the 272 // right of the key. This is indicated by characters on the output 273 // side that allow the cursor to be positioned arbitrarily within 274 // the matching text. For example, abc{def} > | @@@ xyz; changes 275 // def to xyz and moves the cursor to before abc. Offset characters 276 // must be at the start or end, and they cannot move the cursor past 277 // the ante- or postcontext text. Placeholders are only valid in 278 // output text. The length of the ante and post context is 279 // determined at runtime, because of supplementals and quantifiers. 280 int32_t cursorOffset; // only nonzero on output side 281 282 // Position of first CURSOR_OFFSET on _right_. This will be -1 283 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc. 284 int32_t cursorOffsetPos; 285 286 UBool anchorStart; 287 UBool anchorEnd; 288 289 /** 290 * The segment number from 1..n of the next '(' we see 291 * during parsing; 1-based. 292 */ 293 int32_t nextSegmentNumber; 294 295 TransliteratorParser& parser; 296 297 //-------------------------------------------------- 298 // Methods 299 300 RuleHalf(TransliteratorParser& parser); 301 ~RuleHalf(); 302 303 int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 304 305 int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 306 UnicodeString& buf, 307 const UnicodeString& illegal, 308 UBool isSegment, 309 UErrorCode& status); 310 311 /** 312 * Remove context. 313 */ 314 void removeContext(); 315 316 /** 317 * Return true if this half looks like valid output, that is, does not 318 * contain quantifiers or other special input-only elements. 319 */ 320 UBool isValidOutput(TransliteratorParser& parser); 321 322 /** 323 * Return true if this half looks like valid input, that is, does not 324 * contain functions or other special output-only elements. 325 */ 326 UBool isValidInput(TransliteratorParser& parser); 327 328 int syntaxError(UErrorCode code, 329 const UnicodeString& rule, 330 int32_t start, 331 UErrorCode& status) { 332 return parser.syntaxError(code, rule, start, status); 333 } 334 335private: 336 // Disallowed methods; no impl. 337 RuleHalf(const RuleHalf&); 338 RuleHalf& operator=(const RuleHalf&); 339}; 340 341RuleHalf::RuleHalf(TransliteratorParser& p) : 342 parser(p) 343{ 344 cursor = -1; 345 ante = -1; 346 post = -1; 347 cursorOffset = 0; 348 cursorOffsetPos = 0; 349 anchorStart = anchorEnd = FALSE; 350 nextSegmentNumber = 1; 351} 352 353RuleHalf::~RuleHalf() { 354} 355 356/** 357 * Parse one side of a rule, stopping at either the limit, 358 * the END_OF_RULE character, or an operator. 359 * @return the index after the terminating character, or 360 * if limit was reached, limit 361 */ 362int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 363 int32_t start = pos; 364 text.truncate(0); 365 pos = parseSection(rule, pos, limit, text, UnicodeString(TRUE, ILLEGAL_TOP, -1), FALSE, status); 366 367 if (cursorOffset > 0 && cursor != cursorOffsetPos) { 368 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 369 } 370 371 return pos; 372} 373 374/** 375 * Parse a section of one side of a rule, stopping at either 376 * the limit, the END_OF_RULE character, an operator, or a 377 * segment close character. This method parses both a 378 * top-level rule half and a segment within such a rule half. 379 * It calls itself recursively to parse segments and nested 380 * segments. 381 * @param buf buffer into which to accumulate the rule pattern 382 * characters, either literal characters from the rule or 383 * standins for UnicodeMatcher objects including segments. 384 * @param illegal the set of special characters that is illegal during 385 * this parse. 386 * @param isSegment if true, then we've already seen a '(' and 387 * pos on entry points right after it. Accumulate everything 388 * up to the closing ')', put it in a segment matcher object, 389 * generate a standin for it, and add the standin to buf. As 390 * a side effect, update the segments vector with a reference 391 * to the segment matcher. This works recursively for nested 392 * segments. If isSegment is false, just accumulate 393 * characters into buf. 394 * @return the index after the terminating character, or 395 * if limit was reached, limit 396 */ 397int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 398 UnicodeString& buf, 399 const UnicodeString& illegal, 400 UBool isSegment, UErrorCode& status) { 401 int32_t start = pos; 402 ParsePosition pp; 403 UnicodeString scratch; 404 UBool done = FALSE; 405 int32_t quoteStart = -1; // Most recent 'single quoted string' 406 int32_t quoteLimit = -1; 407 int32_t varStart = -1; // Most recent $variableReference 408 int32_t varLimit = -1; 409 int32_t bufStart = buf.length(); 410 411 while (pos < limit && !done) { 412 // Since all syntax characters are in the BMP, fetching 413 // 16-bit code units suffices here. 414 UChar c = rule.charAt(pos++); 415 if (PatternProps::isWhiteSpace(c)) { 416 // Ignore whitespace. Note that this is not Unicode 417 // spaces, but Java spaces -- a subset, representing 418 // whitespace likely to be seen in code. 419 continue; 420 } 421 if (u_strchr(HALF_ENDERS, c) != NULL) { 422 if (isSegment) { 423 // Unclosed segment 424 return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status); 425 } 426 break; 427 } 428 if (anchorEnd) { 429 // Text after a presumed end anchor is a syntax err 430 return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status); 431 } 432 if (UnicodeSet::resemblesPattern(rule, pos-1)) { 433 pp.setIndex(pos-1); // Backup to opening '[' 434 buf.append(parser.parseSet(rule, pp, status)); 435 if (U_FAILURE(status)) { 436 return syntaxError(U_MALFORMED_SET, rule, start, status); 437 } 438 pos = pp.getIndex(); 439 continue; 440 } 441 // Handle escapes 442 if (c == ESCAPE) { 443 if (pos == limit) { 444 return syntaxError(U_TRAILING_BACKSLASH, rule, start, status); 445 } 446 UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\' 447 if (escaped == (UChar32) -1) { 448 return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status); 449 } 450 if (!parser.checkVariableRange(escaped)) { 451 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 452 } 453 buf.append(escaped); 454 continue; 455 } 456 // Handle quoted matter 457 if (c == QUOTE) { 458 int32_t iq = rule.indexOf(QUOTE, pos); 459 if (iq == pos) { 460 buf.append(c); // Parse [''] outside quotes as ['] 461 ++pos; 462 } else { 463 /* This loop picks up a run of quoted text of the 464 * form 'aaaa' each time through. If this run 465 * hasn't really ended ('aaaa''bbbb') then it keeps 466 * looping, each time adding on a new run. When it 467 * reaches the final quote it breaks. 468 */ 469 quoteStart = buf.length(); 470 for (;;) { 471 if (iq < 0) { 472 return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status); 473 } 474 scratch.truncate(0); 475 rule.extractBetween(pos, iq, scratch); 476 buf.append(scratch); 477 pos = iq+1; 478 if (pos < limit && rule.charAt(pos) == QUOTE) { 479 // Parse [''] inside quotes as ['] 480 iq = rule.indexOf(QUOTE, pos+1); 481 // Continue looping 482 } else { 483 break; 484 } 485 } 486 quoteLimit = buf.length(); 487 488 for (iq=quoteStart; iq<quoteLimit; ++iq) { 489 if (!parser.checkVariableRange(buf.charAt(iq))) { 490 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 491 } 492 } 493 } 494 continue; 495 } 496 497 if (!parser.checkVariableRange(c)) { 498 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 499 } 500 501 if (illegal.indexOf(c) >= 0) { 502 syntaxError(U_ILLEGAL_CHARACTER, rule, start, status); 503 } 504 505 switch (c) { 506 507 //------------------------------------------------------ 508 // Elements allowed within and out of segments 509 //------------------------------------------------------ 510 case ANCHOR_START: 511 if (buf.length() == 0 && !anchorStart) { 512 anchorStart = TRUE; 513 } else { 514 return syntaxError(U_MISPLACED_ANCHOR_START, 515 rule, start, status); 516 } 517 break; 518 case SEGMENT_OPEN: 519 { 520 // bufSegStart is the offset in buf to the first 521 // character of the segment we are parsing. 522 int32_t bufSegStart = buf.length(); 523 524 // Record segment number now, since nextSegmentNumber 525 // will be incremented during the call to parseSection 526 // if there are nested segments. 527 int32_t segmentNumber = nextSegmentNumber++; // 1-based 528 529 // Parse the segment 530 pos = parseSection(rule, pos, limit, buf, UnicodeString(TRUE, ILLEGAL_SEG, -1), TRUE, status); 531 532 // After parsing a segment, the relevant characters are 533 // in buf, starting at offset bufSegStart. Extract them 534 // into a string matcher, and replace them with a 535 // standin for that matcher. 536 StringMatcher* m = 537 new StringMatcher(buf, bufSegStart, buf.length(), 538 segmentNumber, *parser.curData); 539 if (m == NULL) { 540 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 541 } 542 543 // Record and associate object and segment number 544 parser.setSegmentObject(segmentNumber, m, status); 545 buf.truncate(bufSegStart); 546 buf.append(parser.getSegmentStandin(segmentNumber, status)); 547 } 548 break; 549 case FUNCTION: 550 case ALT_FUNCTION: 551 { 552 int32_t iref = pos; 553 TransliteratorIDParser::SingleID* single = 554 TransliteratorIDParser::parseFilterID(rule, iref); 555 // The next character MUST be a segment open 556 if (single == NULL || 557 !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) { 558 return syntaxError(U_INVALID_FUNCTION, rule, start, status); 559 } 560 561 Transliterator *t = single->createInstance(); 562 delete single; 563 if (t == NULL) { 564 return syntaxError(U_INVALID_FUNCTION, rule, start, status); 565 } 566 567 // bufSegStart is the offset in buf to the first 568 // character of the segment we are parsing. 569 int32_t bufSegStart = buf.length(); 570 571 // Parse the segment 572 pos = parseSection(rule, iref, limit, buf, UnicodeString(TRUE, ILLEGAL_FUNC, -1), TRUE, status); 573 574 // After parsing a segment, the relevant characters are 575 // in buf, starting at offset bufSegStart. 576 UnicodeString output; 577 buf.extractBetween(bufSegStart, buf.length(), output); 578 FunctionReplacer *r = 579 new FunctionReplacer(t, new StringReplacer(output, parser.curData)); 580 if (r == NULL) { 581 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 582 } 583 584 // Replace the buffer contents with a stand-in 585 buf.truncate(bufSegStart); 586 buf.append(parser.generateStandInFor(r, status)); 587 } 588 break; 589 case SymbolTable::SYMBOL_REF: 590 // Handle variable references and segment references "$1" .. "$9" 591 { 592 // A variable reference must be followed immediately 593 // by a Unicode identifier start and zero or more 594 // Unicode identifier part characters, or by a digit 595 // 1..9 if it is a segment reference. 596 if (pos == limit) { 597 // A variable ref character at the end acts as 598 // an anchor to the context limit, as in perl. 599 anchorEnd = TRUE; 600 break; 601 } 602 // Parse "$1" "$2" .. "$9" .. (no upper limit) 603 c = rule.charAt(pos); 604 int32_t r = u_digit(c, 10); 605 if (r >= 1 && r <= 9) { 606 r = ICU_Utility::parseNumber(rule, pos, 10); 607 if (r < 0) { 608 return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, 609 rule, start, status); 610 } 611 buf.append(parser.getSegmentStandin(r, status)); 612 } else { 613 pp.setIndex(pos); 614 UnicodeString name = parser.parseData-> 615 parseReference(rule, pp, limit); 616 if (name.length() == 0) { 617 // This means the '$' was not followed by a 618 // valid name. Try to interpret it as an 619 // end anchor then. If this also doesn't work 620 // (if we see a following character) then signal 621 // an error. 622 anchorEnd = TRUE; 623 break; 624 } 625 pos = pp.getIndex(); 626 // If this is a variable definition statement, 627 // then the LHS variable will be undefined. In 628 // that case appendVariableDef() will append the 629 // special placeholder char variableLimit-1. 630 varStart = buf.length(); 631 parser.appendVariableDef(name, buf, status); 632 varLimit = buf.length(); 633 } 634 } 635 break; 636 case DOT: 637 buf.append(parser.getDotStandIn(status)); 638 break; 639 case KLEENE_STAR: 640 case ONE_OR_MORE: 641 case ZERO_OR_ONE: 642 // Quantifiers. We handle single characters, quoted strings, 643 // variable references, and segments. 644 // a+ matches aaa 645 // 'foo'+ matches foofoofoo 646 // $v+ matches xyxyxy if $v == xy 647 // (seg)+ matches segsegseg 648 { 649 if (isSegment && buf.length() == bufStart) { 650 // The */+ immediately follows '(' 651 return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status); 652 } 653 654 int32_t qstart, qlimit; 655 // The */+ follows an isolated character or quote 656 // or variable reference 657 if (buf.length() == quoteLimit) { 658 // The */+ follows a 'quoted string' 659 qstart = quoteStart; 660 qlimit = quoteLimit; 661 } else if (buf.length() == varLimit) { 662 // The */+ follows a $variableReference 663 qstart = varStart; 664 qlimit = varLimit; 665 } else { 666 // The */+ follows a single character, possibly 667 // a segment standin 668 qstart = buf.length() - 1; 669 qlimit = qstart + 1; 670 } 671 672 UnicodeFunctor *m = 673 new StringMatcher(buf, qstart, qlimit, 0, *parser.curData); 674 if (m == NULL) { 675 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 676 } 677 int32_t min = 0; 678 int32_t max = Quantifier::MAX; 679 switch (c) { 680 case ONE_OR_MORE: 681 min = 1; 682 break; 683 case ZERO_OR_ONE: 684 min = 0; 685 max = 1; 686 break; 687 // case KLEENE_STAR: 688 // do nothing -- min, max already set 689 } 690 m = new Quantifier(m, min, max); 691 if (m == NULL) { 692 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 693 } 694 buf.truncate(qstart); 695 buf.append(parser.generateStandInFor(m, status)); 696 } 697 break; 698 699 //------------------------------------------------------ 700 // Elements allowed ONLY WITHIN segments 701 //------------------------------------------------------ 702 case SEGMENT_CLOSE: 703 // assert(isSegment); 704 // We're done parsing a segment. 705 done = TRUE; 706 break; 707 708 //------------------------------------------------------ 709 // Elements allowed ONLY OUTSIDE segments 710 //------------------------------------------------------ 711 case CONTEXT_ANTE: 712 if (ante >= 0) { 713 return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status); 714 } 715 ante = buf.length(); 716 break; 717 case CONTEXT_POST: 718 if (post >= 0) { 719 return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status); 720 } 721 post = buf.length(); 722 break; 723 case CURSOR_POS: 724 if (cursor >= 0) { 725 return syntaxError(U_MULTIPLE_CURSORS, rule, start, status); 726 } 727 cursor = buf.length(); 728 break; 729 case CURSOR_OFFSET: 730 if (cursorOffset < 0) { 731 if (buf.length() > 0) { 732 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 733 } 734 --cursorOffset; 735 } else if (cursorOffset > 0) { 736 if (buf.length() != cursorOffsetPos || cursor >= 0) { 737 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 738 } 739 ++cursorOffset; 740 } else { 741 if (cursor == 0 && buf.length() == 0) { 742 cursorOffset = -1; 743 } else if (cursor < 0) { 744 cursorOffsetPos = buf.length(); 745 cursorOffset = 1; 746 } else { 747 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 748 } 749 } 750 break; 751 752 753 //------------------------------------------------------ 754 // Non-special characters 755 //------------------------------------------------------ 756 default: 757 // Disallow unquoted characters other than [0-9A-Za-z] 758 // in the printable ASCII range. These characters are 759 // reserved for possible future use. 760 if (c >= 0x0021 && c <= 0x007E && 761 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 762 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 763 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) { 764 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 765 } 766 buf.append(c); 767 break; 768 } 769 } 770 771 return pos; 772} 773 774/** 775 * Remove context. 776 */ 777void RuleHalf::removeContext() { 778 //text = text.substring(ante < 0 ? 0 : ante, 779 // post < 0 ? text.length() : post); 780 if (post >= 0) { 781 text.remove(post); 782 } 783 if (ante >= 0) { 784 text.removeBetween(0, ante); 785 } 786 ante = post = -1; 787 anchorStart = anchorEnd = FALSE; 788} 789 790/** 791 * Return true if this half looks like valid output, that is, does not 792 * contain quantifiers or other special input-only elements. 793 */ 794UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) { 795 for (int32_t i=0; i<text.length(); ) { 796 UChar32 c = text.char32At(i); 797 i += U16_LENGTH(c); 798 if (!transParser.parseData->isReplacer(c)) { 799 return FALSE; 800 } 801 } 802 return TRUE; 803} 804 805/** 806 * Return true if this half looks like valid input, that is, does not 807 * contain functions or other special output-only elements. 808 */ 809UBool RuleHalf::isValidInput(TransliteratorParser& transParser) { 810 for (int32_t i=0; i<text.length(); ) { 811 UChar32 c = text.char32At(i); 812 i += U16_LENGTH(c); 813 if (!transParser.parseData->isMatcher(c)) { 814 return FALSE; 815 } 816 } 817 return TRUE; 818} 819 820//---------------------------------------------------------------------- 821// PUBLIC API 822//---------------------------------------------------------------------- 823 824/** 825 * Constructor. 826 */ 827TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) : 828dataVector(statusReturn), 829idBlockVector(statusReturn), 830variablesVector(statusReturn), 831segmentObjects(statusReturn) 832{ 833 idBlockVector.setDeleter(uprv_deleteUObject); 834 curData = NULL; 835 compoundFilter = NULL; 836 parseData = NULL; 837 variableNames.setValueDeleter(uprv_deleteUObject); 838} 839 840/** 841 * Destructor. 842 */ 843TransliteratorParser::~TransliteratorParser() { 844 while (!dataVector.isEmpty()) 845 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); 846 delete compoundFilter; 847 delete parseData; 848 while (!variablesVector.isEmpty()) 849 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); 850} 851 852void 853TransliteratorParser::parse(const UnicodeString& rules, 854 UTransDirection transDirection, 855 UParseError& pe, 856 UErrorCode& ec) { 857 if (U_SUCCESS(ec)) { 858 parseRules(rules, transDirection, ec); 859 pe = parseError; 860 } 861} 862 863/** 864 * Return the compound filter parsed by parse(). Caller owns result. 865 */ 866UnicodeSet* TransliteratorParser::orphanCompoundFilter() { 867 UnicodeSet* f = compoundFilter; 868 compoundFilter = NULL; 869 return f; 870} 871 872//---------------------------------------------------------------------- 873// Private implementation 874//---------------------------------------------------------------------- 875 876/** 877 * Parse the given string as a sequence of rules, separated by newline 878 * characters ('\n'), and cause this object to implement those rules. Any 879 * previous rules are discarded. Typically this method is called exactly 880 * once, during construction. 881 * @exception IllegalArgumentException if there is a syntax error in the 882 * rules 883 */ 884void TransliteratorParser::parseRules(const UnicodeString& rule, 885 UTransDirection theDirection, 886 UErrorCode& status) 887{ 888 // Clear error struct 889 uprv_memset(&parseError, 0, sizeof(parseError)); 890 parseError.line = parseError.offset = -1; 891 892 UBool parsingIDs = TRUE; 893 int32_t ruleCount = 0; 894 895 while (!dataVector.isEmpty()) { 896 delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); 897 } 898 if (U_FAILURE(status)) { 899 return; 900 } 901 902 idBlockVector.removeAllElements(); 903 curData = NULL; 904 direction = theDirection; 905 ruleCount = 0; 906 907 delete compoundFilter; 908 compoundFilter = NULL; 909 910 while (!variablesVector.isEmpty()) { 911 delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); 912 } 913 variableNames.removeAll(); 914 parseData = new ParseData(0, &variablesVector, &variableNames); 915 if (parseData == NULL) { 916 status = U_MEMORY_ALLOCATION_ERROR; 917 return; 918 } 919 920 dotStandIn = (UChar) -1; 921 922 UnicodeString *tempstr = NULL; // used for memory allocation error checking 923 UnicodeString str; // scratch 924 UnicodeString idBlockResult; 925 int32_t pos = 0; 926 int32_t limit = rule.length(); 927 928 // The compound filter offset is an index into idBlockResult. 929 // If it is 0, then the compound filter occurred at the start, 930 // and it is the offset to the _start_ of the compound filter 931 // pattern. Otherwise it is the offset to the _limit_ of the 932 // compound filter pattern within idBlockResult. 933 compoundFilter = NULL; 934 int32_t compoundFilterOffset = -1; 935 936 while (pos < limit && U_SUCCESS(status)) { 937 UChar c = rule.charAt(pos++); 938 if (PatternProps::isWhiteSpace(c)) { 939 // Ignore leading whitespace. 940 continue; 941 } 942 // Skip lines starting with the comment character 943 if (c == RULE_COMMENT_CHAR) { 944 pos = rule.indexOf((UChar)0x000A /*\n*/, pos) + 1; 945 if (pos == 0) { 946 break; // No "\n" found; rest of rule is a commnet 947 } 948 continue; // Either fall out or restart with next line 949 } 950 951 // skip empty rules 952 if (c == END_OF_RULE) 953 continue; 954 955 // keep track of how many rules we've seen 956 ++ruleCount; 957 958 // We've found the start of a rule or ID. c is its first 959 // character, and pos points past c. 960 --pos; 961 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 962 // chars left. 963 if ((pos + ID_TOKEN_LEN + 1) <= limit && 964 rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) { 965 pos += ID_TOKEN_LEN; 966 c = rule.charAt(pos); 967 while (PatternProps::isWhiteSpace(c) && pos < limit) { 968 ++pos; 969 c = rule.charAt(pos); 970 } 971 972 int32_t p = pos; 973 974 if (!parsingIDs) { 975 if (curData != NULL) { 976 if (direction == UTRANS_FORWARD) 977 dataVector.addElement(curData, status); 978 else 979 dataVector.insertElementAt(curData, 0, status); 980 curData = NULL; 981 } 982 parsingIDs = TRUE; 983 } 984 985 TransliteratorIDParser::SingleID* id = 986 TransliteratorIDParser::parseSingleID(rule, p, direction, status); 987 if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) { 988 // Successful ::ID parse. 989 990 if (direction == UTRANS_FORWARD) { 991 idBlockResult.append(id->canonID).append(END_OF_RULE); 992 } else { 993 idBlockResult.insert(0, END_OF_RULE); 994 idBlockResult.insert(0, id->canonID); 995 } 996 997 } else { 998 // Couldn't parse an ID. Try to parse a global filter 999 int32_t withParens = -1; 1000 UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL); 1001 if (f != NULL) { 1002 if (ICU_Utility::parseChar(rule, p, END_OF_RULE) 1003 && (direction == UTRANS_FORWARD) == (withParens == 0)) 1004 { 1005 if (compoundFilter != NULL) { 1006 // Multiple compound filters 1007 syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status); 1008 delete f; 1009 } else { 1010 compoundFilter = f; 1011 compoundFilterOffset = ruleCount; 1012 } 1013 } else { 1014 delete f; 1015 } 1016 } else { 1017 // Invalid ::id 1018 // Can be parsed as neither an ID nor a global filter 1019 syntaxError(U_INVALID_ID, rule, pos, status); 1020 } 1021 } 1022 delete id; 1023 pos = p; 1024 } else { 1025 if (parsingIDs) { 1026 tempstr = new UnicodeString(idBlockResult); 1027 // NULL pointer check 1028 if (tempstr == NULL) { 1029 status = U_MEMORY_ALLOCATION_ERROR; 1030 return; 1031 } 1032 if (direction == UTRANS_FORWARD) 1033 idBlockVector.addElement(tempstr, status); 1034 else 1035 idBlockVector.insertElementAt(tempstr, 0, status); 1036 idBlockResult.remove(); 1037 parsingIDs = FALSE; 1038 curData = new TransliterationRuleData(status); 1039 // NULL pointer check 1040 if (curData == NULL) { 1041 status = U_MEMORY_ALLOCATION_ERROR; 1042 return; 1043 } 1044 parseData->data = curData; 1045 1046 // By default, rules use part of the private use area 1047 // E000..F8FF for variables and other stand-ins. Currently 1048 // the range F000..F8FF is typically sufficient. The 'use 1049 // variable range' pragma allows rule sets to modify this. 1050 setVariableRange(0xF000, 0xF8FF, status); 1051 } 1052 1053 if (resemblesPragma(rule, pos, limit)) { 1054 int32_t ppp = parsePragma(rule, pos, limit, status); 1055 if (ppp < 0) { 1056 syntaxError(U_MALFORMED_PRAGMA, rule, pos, status); 1057 } 1058 pos = ppp; 1059 // Parse a rule 1060 } else { 1061 pos = parseRule(rule, pos, limit, status); 1062 } 1063 } 1064 } 1065 1066 if (parsingIDs && idBlockResult.length() > 0) { 1067 tempstr = new UnicodeString(idBlockResult); 1068 // NULL pointer check 1069 if (tempstr == NULL) { 1070 status = U_MEMORY_ALLOCATION_ERROR; 1071 return; 1072 } 1073 if (direction == UTRANS_FORWARD) 1074 idBlockVector.addElement(tempstr, status); 1075 else 1076 idBlockVector.insertElementAt(tempstr, 0, status); 1077 } 1078 else if (!parsingIDs && curData != NULL) { 1079 if (direction == UTRANS_FORWARD) 1080 dataVector.addElement(curData, status); 1081 else 1082 dataVector.insertElementAt(curData, 0, status); 1083 } 1084 1085 if (U_SUCCESS(status)) { 1086 // Convert the set vector to an array 1087 int32_t i, dataVectorSize = dataVector.size(); 1088 for (i = 0; i < dataVectorSize; i++) { 1089 TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); 1090 data->variablesLength = variablesVector.size(); 1091 if (data->variablesLength == 0) { 1092 data->variables = 0; 1093 } else { 1094 data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*)); 1095 // NULL pointer check 1096 if (data->variables == NULL) { 1097 status = U_MEMORY_ALLOCATION_ERROR; 1098 return; 1099 } 1100 data->variablesAreOwned = (i == 0); 1101 } 1102 1103 for (int32_t j = 0; j < data->variablesLength; j++) { 1104 data->variables[j] = 1105 ((UnicodeSet*)variablesVector.elementAt(j)); 1106 } 1107 1108 data->variableNames.removeAll(); 1109 int32_t pos = UHASH_FIRST; 1110 const UHashElement* he = variableNames.nextElement(pos); 1111 while (he != NULL) { 1112 UnicodeString* tempus = (UnicodeString*)(((UnicodeString*)(he->value.pointer))->clone()); 1113 if (tempus == NULL) { 1114 status = U_MEMORY_ALLOCATION_ERROR; 1115 return; 1116 } 1117 data->variableNames.put(*((UnicodeString*)(he->key.pointer)), 1118 tempus, status); 1119 he = variableNames.nextElement(pos); 1120 } 1121 } 1122 variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed 1123 1124 // Index the rules 1125 if (compoundFilter != NULL) { 1126 if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) || 1127 (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) { 1128 status = U_MISPLACED_COMPOUND_FILTER; 1129 } 1130 } 1131 1132 for (i = 0; i < dataVectorSize; i++) { 1133 TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); 1134 data->ruleSet.freeze(parseError, status); 1135 } 1136 if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementAt(0))->isEmpty()) { 1137 idBlockVector.removeElementAt(0); 1138 } 1139 } 1140} 1141 1142/** 1143 * Set the variable range to [start, end] (inclusive). 1144 */ 1145void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) { 1146 if (start > end || start < 0 || end > 0xFFFF) { 1147 status = U_MALFORMED_PRAGMA; 1148 return; 1149 } 1150 1151 curData->variablesBase = (UChar) start; 1152 if (dataVector.size() == 0) { 1153 variableNext = (UChar) start; 1154 variableLimit = (UChar) (end + 1); 1155 } 1156} 1157 1158/** 1159 * Assert that the given character is NOT within the variable range. 1160 * If it is, return FALSE. This is neccesary to ensure that the 1161 * variable range does not overlap characters used in a rule. 1162 */ 1163UBool TransliteratorParser::checkVariableRange(UChar32 ch) const { 1164 return !(ch >= curData->variablesBase && ch < variableLimit); 1165} 1166 1167/** 1168 * Set the maximum backup to 'backup', in response to a pragma 1169 * statement. 1170 */ 1171void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) { 1172 //TODO Finish 1173} 1174 1175/** 1176 * Begin normalizing all rules using the given mode, in response 1177 * to a pragma statement. 1178 */ 1179void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) { 1180 //TODO Finish 1181} 1182 1183static const UChar PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use " 1184 1185static const UChar PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;" 1186 1187static const UChar PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;" 1188 1189static const UChar PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;" 1190 1191static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;" 1192 1193/** 1194 * Return true if the given rule looks like a pragma. 1195 * @param pos offset to the first non-whitespace character 1196 * of the rule. 1197 * @param limit pointer past the last character of the rule. 1198 */ 1199UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) { 1200 // Must start with /use\s/i 1201 return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_USE, 4), NULL) >= 0; 1202} 1203 1204/** 1205 * Parse a pragma. This method assumes resemblesPragma() has 1206 * already returned true. 1207 * @param pos offset to the first non-whitespace character 1208 * of the rule. 1209 * @param limit pointer past the last character of the rule. 1210 * @return the position index after the final ';' of the pragma, 1211 * or -1 on failure. 1212 */ 1213int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1214 int32_t array[2]; 1215 1216 // resemblesPragma() has already returned true, so we 1217 // know that pos points to /use\s/i; we can skip 4 characters 1218 // immediately 1219 pos += 4; 1220 1221 // Here are the pragmas we recognize: 1222 // use variable range 0xE000 0xEFFF; 1223 // use maximum backup 16; 1224 // use nfd rules; 1225 // use nfc rules; 1226 int p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_VARIABLE_RANGE, -1), array); 1227 if (p >= 0) { 1228 setVariableRange(array[0], array[1], status); 1229 return p; 1230 } 1231 1232 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_MAXIMUM_BACKUP, -1), array); 1233 if (p >= 0) { 1234 pragmaMaximumBackup(array[0]); 1235 return p; 1236 } 1237 1238 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFD_RULES, -1), NULL); 1239 if (p >= 0) { 1240 pragmaNormalizeRules(UNORM_NFD); 1241 return p; 1242 } 1243 1244 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFC_RULES, -1), NULL); 1245 if (p >= 0) { 1246 pragmaNormalizeRules(UNORM_NFC); 1247 return p; 1248 } 1249 1250 // Syntax error: unable to parse pragma 1251 return -1; 1252} 1253 1254/** 1255 * MAIN PARSER. Parse the next rule in the given rule string, starting 1256 * at pos. Return the index after the last character parsed. Do not 1257 * parse characters at or after limit. 1258 * 1259 * Important: The character at pos must be a non-whitespace character 1260 * that is not the comment character. 1261 * 1262 * This method handles quoting, escaping, and whitespace removal. It 1263 * parses the end-of-rule character. It recognizes context and cursor 1264 * indicators. Once it does a lexical breakdown of the rule at pos, it 1265 * creates a rule object and adds it to our rule list. 1266 */ 1267int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1268 // Locate the left side, operator, and right side 1269 int32_t start = pos; 1270 UChar op = 0; 1271 int32_t i; 1272 1273 // Set up segments data 1274 segmentStandins.truncate(0); 1275 segmentObjects.removeAllElements(); 1276 1277 // Use pointers to automatics to make swapping possible. 1278 RuleHalf _left(*this), _right(*this); 1279 RuleHalf* left = &_left; 1280 RuleHalf* right = &_right; 1281 1282 undefinedVariableName.remove(); 1283 pos = left->parse(rule, pos, limit, status); 1284 if (U_FAILURE(status)) { 1285 return start; 1286 } 1287 1288 if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) { 1289 return syntaxError(U_MISSING_OPERATOR, rule, start, status); 1290 } 1291 ++pos; 1292 1293 // Found an operator char. Check for forward-reverse operator. 1294 if (op == REVERSE_RULE_OP && 1295 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { 1296 ++pos; 1297 op = FWDREV_RULE_OP; 1298 } 1299 1300 // Translate alternate op characters. 1301 switch (op) { 1302 case ALT_FORWARD_RULE_OP: 1303 op = FORWARD_RULE_OP; 1304 break; 1305 case ALT_REVERSE_RULE_OP: 1306 op = REVERSE_RULE_OP; 1307 break; 1308 case ALT_FWDREV_RULE_OP: 1309 op = FWDREV_RULE_OP; 1310 break; 1311 } 1312 1313 pos = right->parse(rule, pos, limit, status); 1314 if (U_FAILURE(status)) { 1315 return start; 1316 } 1317 1318 if (pos < limit) { 1319 if (rule.charAt(--pos) == END_OF_RULE) { 1320 ++pos; 1321 } else { 1322 // RuleHalf parser must have terminated at an operator 1323 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 1324 } 1325 } 1326 1327 if (op == VARIABLE_DEF_OP) { 1328 // LHS is the name. RHS is a single character, either a literal 1329 // or a set (already parsed). If RHS is longer than one 1330 // character, it is either a multi-character string, or multiple 1331 // sets, or a mixture of chars and sets -- syntax error. 1332 1333 // We expect to see a single undefined variable (the one being 1334 // defined). 1335 if (undefinedVariableName.length() == 0) { 1336 // "Missing '$' or duplicate definition" 1337 return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status); 1338 } 1339 if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) { 1340 // "Malformed LHS" 1341 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1342 } 1343 if (left->anchorStart || left->anchorEnd || 1344 right->anchorStart || right->anchorEnd) { 1345 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1346 } 1347 // We allow anything on the right, including an empty string. 1348 UnicodeString* value = new UnicodeString(right->text); 1349 // NULL pointer check 1350 if (value == NULL) { 1351 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1352 } 1353 variableNames.put(undefinedVariableName, value, status); 1354 ++variableLimit; 1355 return pos; 1356 } 1357 1358 // If this is not a variable definition rule, we shouldn't have 1359 // any undefined variable names. 1360 if (undefinedVariableName.length() != 0) { 1361 return syntaxError(// "Undefined variable $" + undefinedVariableName, 1362 U_UNDEFINED_VARIABLE, 1363 rule, start, status); 1364 } 1365 1366 // Verify segments 1367 if (segmentStandins.length() > segmentObjects.size()) { 1368 syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status); 1369 } 1370 for (i=0; i<segmentStandins.length(); ++i) { 1371 if (segmentStandins.charAt(i) == 0) { 1372 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1373 } 1374 } 1375 for (i=0; i<segmentObjects.size(); ++i) { 1376 if (segmentObjects.elementAt(i) == NULL) { 1377 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1378 } 1379 } 1380 1381 // If the direction we want doesn't match the rule 1382 // direction, do nothing. 1383 if (op != FWDREV_RULE_OP && 1384 ((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) { 1385 return pos; 1386 } 1387 1388 // Transform the rule into a forward rule by swapping the 1389 // sides if necessary. 1390 if (direction == UTRANS_REVERSE) { 1391 left = &_right; 1392 right = &_left; 1393 } 1394 1395 // Remove non-applicable elements in forward-reverse 1396 // rules. Bidirectional rules ignore elements that do not 1397 // apply. 1398 if (op == FWDREV_RULE_OP) { 1399 right->removeContext(); 1400 left->cursor = -1; 1401 left->cursorOffset = 0; 1402 } 1403 1404 // Normalize context 1405 if (left->ante < 0) { 1406 left->ante = 0; 1407 } 1408 if (left->post < 0) { 1409 left->post = left->text.length(); 1410 } 1411 1412 // Context is only allowed on the input side. Cursors are only 1413 // allowed on the output side. Segment delimiters can only appear 1414 // on the left, and references on the right. Cursor offset 1415 // cannot appear without an explicit cursor. Cursor offset 1416 // cannot place the cursor outside the limits of the context. 1417 // Anchors are only allowed on the input side. 1418 if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 || 1419 (right->cursorOffset != 0 && right->cursor < 0) || 1420 // - The following two checks were used to ensure that the 1421 // - the cursor offset stayed within the ante- or postcontext. 1422 // - However, with the addition of quantifiers, we have to 1423 // - allow arbitrary cursor offsets and do runtime checking. 1424 //(right->cursorOffset > (left->text.length() - left->post)) || 1425 //(-right->cursorOffset > left->ante) || 1426 right->anchorStart || right->anchorEnd || 1427 !left->isValidInput(*this) || !right->isValidOutput(*this) || 1428 left->ante > left->post) { 1429 1430 return syntaxError(U_MALFORMED_RULE, rule, start, status); 1431 } 1432 1433 // Flatten segment objects vector to an array 1434 UnicodeFunctor** segmentsArray = NULL; 1435 if (segmentObjects.size() > 0) { 1436 segmentsArray = (UnicodeFunctor **)uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor *)); 1437 // Null pointer check 1438 if (segmentsArray == NULL) { 1439 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1440 } 1441 segmentObjects.toArray((void**) segmentsArray); 1442 } 1443 TransliterationRule* temptr = new TransliterationRule( 1444 left->text, left->ante, left->post, 1445 right->text, right->cursor, right->cursorOffset, 1446 segmentsArray, 1447 segmentObjects.size(), 1448 left->anchorStart, left->anchorEnd, 1449 curData, 1450 status); 1451 //Null pointer check 1452 if (temptr == NULL) { 1453 uprv_free(segmentsArray); 1454 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1455 } 1456 1457 curData->ruleSet.addRule(temptr, status); 1458 1459 return pos; 1460} 1461 1462/** 1463 * Called by main parser upon syntax error. Search the rule string 1464 * for the probable end of the rule. Of course, if the error is that 1465 * the end of rule marker is missing, then the rule end will not be found. 1466 * In any case the rule start will be correctly reported. 1467 * @param msg error description 1468 * @param rule pattern string 1469 * @param start position of first character of current rule 1470 */ 1471int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode, 1472 const UnicodeString& rule, 1473 int32_t pos, 1474 UErrorCode& status) 1475{ 1476 parseError.offset = pos; 1477 parseError.line = 0 ; /* we are not using line numbers */ 1478 1479 // for pre-context 1480 const int32_t LEN = U_PARSE_CONTEXT_LEN - 1; 1481 int32_t start = uprv_max(pos - LEN, 0); 1482 int32_t stop = pos; 1483 1484 rule.extract(start,stop-start,parseError.preContext); 1485 //null terminate the buffer 1486 parseError.preContext[stop-start] = 0; 1487 1488 //for post-context 1489 start = pos; 1490 stop = uprv_min(pos + LEN, rule.length()); 1491 1492 rule.extract(start,stop-start,parseError.postContext); 1493 //null terminate the buffer 1494 parseError.postContext[stop-start]= 0; 1495 1496 status = (UErrorCode)parseErrorCode; 1497 return pos; 1498 1499} 1500 1501/** 1502 * Parse a UnicodeSet out, store it, and return the stand-in character 1503 * used to represent it. 1504 */ 1505UChar TransliteratorParser::parseSet(const UnicodeString& rule, 1506 ParsePosition& pos, 1507 UErrorCode& status) { 1508 UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status); 1509 // Null pointer check 1510 if (set == NULL) { 1511 status = U_MEMORY_ALLOCATION_ERROR; 1512 return (UChar)0x0000; // Return empty character with error. 1513 } 1514 set->compact(); 1515 return generateStandInFor(set, status); 1516} 1517 1518/** 1519 * Generate and return a stand-in for a new UnicodeFunctor. Store 1520 * the matcher (adopt it). 1521 */ 1522UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) { 1523 // assert(obj != null); 1524 1525 // Look up previous stand-in, if any. This is a short list 1526 // (typical n is 0, 1, or 2); linear search is optimal. 1527 for (int32_t i=0; i<variablesVector.size(); ++i) { 1528 if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison 1529 return (UChar) (curData->variablesBase + i); 1530 } 1531 } 1532 1533 if (variableNext >= variableLimit) { 1534 delete adopted; 1535 status = U_VARIABLE_RANGE_EXHAUSTED; 1536 return 0; 1537 } 1538 variablesVector.addElement(adopted, status); 1539 return variableNext++; 1540} 1541 1542/** 1543 * Return the standin for segment seg (1-based). 1544 */ 1545UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) { 1546 // Special character used to indicate an empty spot 1547 UChar empty = curData->variablesBase - 1; 1548 while (segmentStandins.length() < seg) { 1549 segmentStandins.append(empty); 1550 } 1551 UChar c = segmentStandins.charAt(seg-1); 1552 if (c == empty) { 1553 if (variableNext >= variableLimit) { 1554 status = U_VARIABLE_RANGE_EXHAUSTED; 1555 return 0; 1556 } 1557 c = variableNext++; 1558 // Set a placeholder in the master variables vector that will be 1559 // filled in later by setSegmentObject(). We know that we will get 1560 // called first because setSegmentObject() will call us. 1561 variablesVector.addElement((void*) NULL, status); 1562 segmentStandins.setCharAt(seg-1, c); 1563 } 1564 return c; 1565} 1566 1567/** 1568 * Set the object for segment seg (1-based). 1569 */ 1570void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) { 1571 // Since we call parseSection() recursively, nested 1572 // segments will result in segment i+1 getting parsed 1573 // and stored before segment i; be careful with the 1574 // vector handling here. 1575 if (segmentObjects.size() < seg) { 1576 segmentObjects.setSize(seg, status); 1577 } 1578 int32_t index = getSegmentStandin(seg, status) - curData->variablesBase; 1579 if (segmentObjects.elementAt(seg-1) != NULL || 1580 variablesVector.elementAt(index) != NULL) { 1581 // should never happen 1582 status = U_INTERNAL_TRANSLITERATOR_ERROR; 1583 return; 1584 } 1585 segmentObjects.setElementAt(adopted, seg-1); 1586 variablesVector.setElementAt(adopted, index); 1587} 1588 1589/** 1590 * Return the stand-in for the dot set. It is allocated the first 1591 * time and reused thereafter. 1592 */ 1593UChar TransliteratorParser::getDotStandIn(UErrorCode& status) { 1594 if (dotStandIn == (UChar) -1) { 1595 UnicodeSet* tempus = new UnicodeSet(UnicodeString(TRUE, DOT_SET, -1), status); 1596 // Null pointer check. 1597 if (tempus == NULL) { 1598 status = U_MEMORY_ALLOCATION_ERROR; 1599 return (UChar)0x0000; 1600 } 1601 dotStandIn = generateStandInFor(tempus, status); 1602 } 1603 return dotStandIn; 1604} 1605 1606/** 1607 * Append the value of the given variable name to the given 1608 * UnicodeString. 1609 */ 1610void TransliteratorParser::appendVariableDef(const UnicodeString& name, 1611 UnicodeString& buf, 1612 UErrorCode& status) { 1613 const UnicodeString* s = (const UnicodeString*) variableNames.get(name); 1614 if (s == NULL) { 1615 // We allow one undefined variable so that variable definition 1616 // statements work. For the first undefined variable we return 1617 // the special placeholder variableLimit-1, and save the variable 1618 // name. 1619 if (undefinedVariableName.length() == 0) { 1620 undefinedVariableName = name; 1621 if (variableNext >= variableLimit) { 1622 // throw new RuntimeException("Private use variables exhausted"); 1623 status = U_ILLEGAL_ARGUMENT_ERROR; 1624 return; 1625 } 1626 buf.append((UChar) --variableLimit); 1627 } else { 1628 //throw new IllegalArgumentException("Undefined variable $" 1629 // + name); 1630 status = U_ILLEGAL_ARGUMENT_ERROR; 1631 return; 1632 } 1633 } else { 1634 buf.append(*s); 1635 } 1636} 1637 1638/** 1639 * Glue method to get around access restrictions in C++. 1640 */ 1641/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 1642 return Transliterator::createBasicInstance(id, canonID); 1643}*/ 1644 1645U_NAMESPACE_END 1646 1647U_CAPI int32_t 1648utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) { 1649 U_NAMESPACE_USE 1650 1651 //const UChar *sourceStart = source; 1652 const UChar *targetStart = target; 1653 const UChar *sourceLimit = source+sourceLen; 1654 UChar *targetLimit = target+sourceLen; 1655 UChar32 c = 0; 1656 UBool quoted = FALSE; 1657 int32_t index; 1658 1659 uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR); 1660 1661 /* read the rules into the buffer */ 1662 while (source < sourceLimit) 1663 { 1664 index=0; 1665 U16_NEXT_UNSAFE(source, index, c); 1666 source+=index; 1667 if(c == QUOTE) { 1668 quoted = (UBool)!quoted; 1669 } 1670 else if (!quoted) { 1671 if (c == RULE_COMMENT_CHAR) { 1672 /* skip comments and all preceding spaces */ 1673 while (targetStart < target && *(target - 1) == 0x0020) { 1674 target--; 1675 } 1676 do { 1677 if (source == sourceLimit) { 1678 c = U_SENTINEL; 1679 break; 1680 } 1681 c = *(source++); 1682 } 1683 while (c != CR && c != LF); 1684 if (c < 0) { 1685 break; 1686 } 1687 } 1688 else if (c == ESCAPE && source < sourceLimit) { 1689 UChar32 c2 = *source; 1690 if (c2 == CR || c2 == LF) { 1691 /* A backslash at the end of a line. */ 1692 /* Since we're stripping lines, ignore the backslash. */ 1693 source++; 1694 continue; 1695 } 1696 if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn't unescaped. */ 1697 int32_t escapeOffset = 0; 1698 UnicodeString escapedStr(source, 5); 1699 c2 = escapedStr.unescapeAt(escapeOffset); 1700 1701 if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0) 1702 { 1703 *status = U_PARSE_ERROR; 1704 return 0; 1705 } 1706 if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) { 1707 /* It was escaped for a reason. Write what it was suppose to be. */ 1708 source+=5; 1709 c = c2; 1710 } 1711 } 1712 else if (c2 == QUOTE) { 1713 /* \' seen. Make sure we don't do anything when we see it again. */ 1714 quoted = (UBool)!quoted; 1715 } 1716 } 1717 } 1718 if (c == CR || c == LF) 1719 { 1720 /* ignore spaces carriage returns, and all leading spaces on the next line. 1721 * and line feed unless in the form \uXXXX 1722 */ 1723 quoted = FALSE; 1724 while (source < sourceLimit) { 1725 c = *(source); 1726 if (c != CR && c != LF && c != 0x0020) { 1727 break; 1728 } 1729 source++; 1730 } 1731 continue; 1732 } 1733 1734 /* Append UChar * after dissembling if c > 0xffff*/ 1735 index=0; 1736 U16_APPEND_UNSAFE(target, index, c); 1737 target+=index; 1738 } 1739 if (target < targetLimit) { 1740 *target = 0; 1741 } 1742 return (int32_t)(target-targetStart); 1743} 1744 1745#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1746