1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4****************************************************************************** 5* Copyright (C) 1997-2015, International Business Machines 6* Corporation and others. All Rights Reserved. 7****************************************************************************** 8* file name: nfrule.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* Modification history 14* Date Name Comments 15* 10/11/2001 Doug Ported from ICU4J 16*/ 17 18#include "nfrule.h" 19 20#if U_HAVE_RBNF 21 22#include "unicode/localpointer.h" 23#include "unicode/rbnf.h" 24#include "unicode/tblcoll.h" 25#include "unicode/plurfmt.h" 26#include "unicode/upluralrules.h" 27#include "unicode/coleitr.h" 28#include "unicode/uchar.h" 29#include "nfrs.h" 30#include "nfrlist.h" 31#include "nfsubs.h" 32#include "patternprops.h" 33 34U_NAMESPACE_BEGIN 35 36NFRule::NFRule(const RuleBasedNumberFormat* _rbnf, const UnicodeString &_ruleText, UErrorCode &status) 37 : baseValue((int32_t)0) 38 , radix(10) 39 , exponent(0) 40 , decimalPoint(0) 41 , ruleText(_ruleText) 42 , sub1(NULL) 43 , sub2(NULL) 44 , formatter(_rbnf) 45 , rulePatternFormat(NULL) 46{ 47 if (!ruleText.isEmpty()) { 48 parseRuleDescriptor(ruleText, status); 49 } 50} 51 52NFRule::~NFRule() 53{ 54 if (sub1 != sub2) { 55 delete sub2; 56 sub2 = NULL; 57 } 58 delete sub1; 59 sub1 = NULL; 60 delete rulePatternFormat; 61 rulePatternFormat = NULL; 62} 63 64static const UChar gLeftBracket = 0x005b; 65static const UChar gRightBracket = 0x005d; 66static const UChar gColon = 0x003a; 67static const UChar gZero = 0x0030; 68static const UChar gNine = 0x0039; 69static const UChar gSpace = 0x0020; 70static const UChar gSlash = 0x002f; 71static const UChar gGreaterThan = 0x003e; 72static const UChar gLessThan = 0x003c; 73static const UChar gComma = 0x002c; 74static const UChar gDot = 0x002e; 75static const UChar gTick 
= 0x0027; 76//static const UChar gMinus = 0x002d; 77static const UChar gSemicolon = 0x003b; 78static const UChar gX = 0x0078; 79 80static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ 81static const UChar gInf[] = {0x49, 0x6E, 0x66, 0}; /* "Inf" */ 82static const UChar gNaN[] = {0x4E, 0x61, 0x4E, 0}; /* "NaN" */ 83 84static const UChar gDollarOpenParenthesis[] = {0x24, 0x28, 0}; /* "$(" */ 85static const UChar gClosedParenthesisDollar[] = {0x29, 0x24, 0}; /* ")$" */ 86 87static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ 88static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ 89static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ 90static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ 91static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ 92static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ 93static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ 94static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ 95static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ 96static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ 97static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ 98static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */ 99 100static const UChar * const RULE_PREFIXES[] = { 101 gLessLess, gLessPercent, gLessHash, gLessZero, 102 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, 103 gEqualPercent, gEqualHash, gEqualZero, NULL 104}; 105 106void 107NFRule::makeRules(UnicodeString& description, 108 NFRuleSet *owner, 109 const NFRule *predecessor, 110 const RuleBasedNumberFormat *rbnf, 111 NFRuleList& rules, 112 UErrorCode& status) 113{ 114 // we know we're making at least one rule, so go ahead and 115 // new it up and initialize its basevalue and divisor 116 // (this also strips the rule descriptor, if any, off the 117 // descripton string) 118 NFRule* rule1 = new NFRule(rbnf, description, status); 119 /* test for NULL 
*/ 120 if (rule1 == 0) { 121 status = U_MEMORY_ALLOCATION_ERROR; 122 return; 123 } 124 description = rule1->ruleText; 125 126 // check the description to see whether there's text enclosed 127 // in brackets 128 int32_t brack1 = description.indexOf(gLeftBracket); 129 int32_t brack2 = brack1 < 0 ? -1 : description.indexOf(gRightBracket); 130 131 // if the description doesn't contain a matched pair of brackets, 132 // or if it's of a type that doesn't recognize bracketed text, 133 // then leave the description alone, initialize the rule's 134 // rule text and substitutions, and return that rule 135 if (brack2 < 0 || brack1 > brack2 136 || rule1->getType() == kProperFractionRule 137 || rule1->getType() == kNegativeNumberRule 138 || rule1->getType() == kInfinityRule 139 || rule1->getType() == kNaNRule) 140 { 141 rule1->extractSubstitutions(owner, description, predecessor, status); 142 } 143 else { 144 // if the description does contain a matched pair of brackets, 145 // then it's really shorthand for two rules (with one exception) 146 NFRule* rule2 = NULL; 147 UnicodeString sbuf; 148 149 // we'll actually only split the rule into two rules if its 150 // base value is an even multiple of its divisor (or it's one 151 // of the special rules) 152 if ((rule1->baseValue > 0 153 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0) 154 || rule1->getType() == kImproperFractionRule 155 || rule1->getType() == kMasterRule) { 156 157 // if it passes that test, new up the second rule. 
If the 158 // rule set both rules will belong to is a fraction rule 159 // set, they both have the same base value; otherwise, 160 // increment the original rule's base value ("rule1" actually 161 // goes SECOND in the rule set's rule list) 162 rule2 = new NFRule(rbnf, UnicodeString(), status); 163 /* test for NULL */ 164 if (rule2 == 0) { 165 status = U_MEMORY_ALLOCATION_ERROR; 166 return; 167 } 168 if (rule1->baseValue >= 0) { 169 rule2->baseValue = rule1->baseValue; 170 if (!owner->isFractionRuleSet()) { 171 ++rule1->baseValue; 172 } 173 } 174 175 // if the description began with "x.x" and contains bracketed 176 // text, it describes both the improper fraction rule and 177 // the proper fraction rule 178 else if (rule1->getType() == kImproperFractionRule) { 179 rule2->setType(kProperFractionRule); 180 } 181 182 // if the description began with "x.0" and contains bracketed 183 // text, it describes both the master rule and the 184 // improper fraction rule 185 else if (rule1->getType() == kMasterRule) { 186 rule2->baseValue = rule1->baseValue; 187 rule1->setType(kImproperFractionRule); 188 } 189 190 // both rules have the same radix and exponent (i.e., the 191 // same divisor) 192 rule2->radix = rule1->radix; 193 rule2->exponent = rule1->exponent; 194 195 // rule2's rule text omits the stuff in brackets: initalize 196 // its rule text and substitutions accordingly 197 sbuf.append(description, 0, brack1); 198 if (brack2 + 1 < description.length()) { 199 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 200 } 201 rule2->extractSubstitutions(owner, sbuf, predecessor, status); 202 } 203 204 // rule1's text includes the text in the brackets but omits 205 // the brackets themselves: initialize _its_ rule text and 206 // substitutions accordingly 207 sbuf.setTo(description, 0, brack1); 208 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); 209 if (brack2 + 1 < description.length()) { 210 sbuf.append(description, brack2 + 1, 
description.length() - brack2 - 1); 211 } 212 rule1->extractSubstitutions(owner, sbuf, predecessor, status); 213 214 // if we only have one rule, return it; if we have two, return 215 // a two-element array containing them (notice that rule2 goes 216 // BEFORE rule1 in the list: in all cases, rule2 OMITS the 217 // material in the brackets and rule1 INCLUDES the material 218 // in the brackets) 219 if (rule2 != NULL) { 220 if (rule2->baseValue >= kNoBase) { 221 rules.add(rule2); 222 } 223 else { 224 owner->setNonNumericalRule(rule2); 225 } 226 } 227 } 228 if (rule1->baseValue >= kNoBase) { 229 rules.add(rule1); 230 } 231 else { 232 owner->setNonNumericalRule(rule1); 233 } 234} 235 236/** 237 * This function parses the rule's rule descriptor (i.e., the base 238 * value and/or other tokens that precede the rule's rule text 239 * in the description) and sets the rule's base value, radix, and 240 * exponent according to the descriptor. (If the description doesn't 241 * include a rule descriptor, then this function sets everything to 242 * default values and the rule set sets the rule's real base value). 243 * @param description The rule's description 244 * @return If "description" included a rule descriptor, this is 245 * "description" with the descriptor and any trailing whitespace 246 * stripped off. Otherwise; it's "descriptor" unchangd. 247 */ 248void 249NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) 250{ 251 // the description consists of a rule descriptor and a rule body, 252 // separated by a colon. The rule descriptor is optional. If 253 // it's omitted, just set the base value to 0. 
254 int32_t p = description.indexOf(gColon); 255 if (p != -1) { 256 // copy the descriptor out into its own string and strip it, 257 // along with any trailing whitespace, out of the original 258 // description 259 UnicodeString descriptor; 260 descriptor.setTo(description, 0, p); 261 262 ++p; 263 while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) { 264 ++p; 265 } 266 description.removeBetween(0, p); 267 268 // check first to see if the rule descriptor matches the token 269 // for one of the special rules. If it does, set the base 270 // value to the correct identifier value 271 int descriptorLength = descriptor.length(); 272 UChar firstChar = descriptor.charAt(0); 273 UChar lastChar = descriptor.charAt(descriptorLength - 1); 274 if (firstChar >= gZero && firstChar <= gNine && lastChar != gX) { 275 // if the rule descriptor begins with a digit, it's a descriptor 276 // for a normal rule 277 // since we don't have Long.parseLong, and this isn't much work anyway, 278 // just build up the value as we encounter the digits. 
279 int64_t val = 0; 280 p = 0; 281 UChar c = gSpace; 282 283 // begin parsing the descriptor: copy digits 284 // into "tempValue", skip periods, commas, and spaces, 285 // stop on a slash or > sign (or at the end of the string), 286 // and throw an exception on any other character 287 int64_t ll_10 = 10; 288 while (p < descriptorLength) { 289 c = descriptor.charAt(p); 290 if (c >= gZero && c <= gNine) { 291 val = val * ll_10 + (int32_t)(c - gZero); 292 } 293 else if (c == gSlash || c == gGreaterThan) { 294 break; 295 } 296 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { 297 } 298 else { 299 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 300 status = U_PARSE_ERROR; 301 return; 302 } 303 ++p; 304 } 305 306 // we have the base value, so set it 307 setBaseValue(val, status); 308 309 // if we stopped the previous loop on a slash, we're 310 // now parsing the rule's radix. Again, accumulate digits 311 // in tempValue, skip punctuation, stop on a > mark, and 312 // throw an exception on anything else 313 if (c == gSlash) { 314 val = 0; 315 ++p; 316 int64_t ll_10 = 10; 317 while (p < descriptorLength) { 318 c = descriptor.charAt(p); 319 if (c >= gZero && c <= gNine) { 320 val = val * ll_10 + (int32_t)(c - gZero); 321 } 322 else if (c == gGreaterThan) { 323 break; 324 } 325 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { 326 } 327 else { 328 // throw new IllegalArgumentException("Illegal character is rule descriptor"); 329 status = U_PARSE_ERROR; 330 return; 331 } 332 ++p; 333 } 334 335 // tempValue now contain's the rule's radix. 
Set it 336 // accordingly, and recalculate the rule's exponent 337 radix = (int32_t)val; 338 if (radix == 0) { 339 // throw new IllegalArgumentException("Rule can't have radix of 0"); 340 status = U_PARSE_ERROR; 341 } 342 343 exponent = expectedExponent(); 344 } 345 346 // if we stopped the previous loop on a > sign, then continue 347 // for as long as we still see > signs. For each one, 348 // decrement the exponent (unless the exponent is already 0). 349 // If we see another character before reaching the end of 350 // the descriptor, that's also a syntax error. 351 if (c == gGreaterThan) { 352 while (p < descriptor.length()) { 353 c = descriptor.charAt(p); 354 if (c == gGreaterThan && exponent > 0) { 355 --exponent; 356 } else { 357 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 358 status = U_PARSE_ERROR; 359 return; 360 } 361 ++p; 362 } 363 } 364 } 365 else if (0 == descriptor.compare(gMinusX, 2)) { 366 setType(kNegativeNumberRule); 367 } 368 else if (descriptorLength == 3) { 369 if (firstChar == gZero && lastChar == gX) { 370 setBaseValue(kProperFractionRule, status); 371 decimalPoint = descriptor.charAt(1); 372 } 373 else if (firstChar == gX && lastChar == gX) { 374 setBaseValue(kImproperFractionRule, status); 375 decimalPoint = descriptor.charAt(1); 376 } 377 else if (firstChar == gX && lastChar == gZero) { 378 setBaseValue(kMasterRule, status); 379 decimalPoint = descriptor.charAt(1); 380 } 381 else if (descriptor.compare(gNaN, 3) == 0) { 382 setBaseValue(kNaNRule, status); 383 } 384 else if (descriptor.compare(gInf, 3) == 0) { 385 setBaseValue(kInfinityRule, status); 386 } 387 } 388 } 389 // else use the default base value for now. 
390 391 // finally, if the rule body begins with an apostrophe, strip it off 392 // (this is generally used to put whitespace at the beginning of 393 // a rule's rule text) 394 if (description.length() > 0 && description.charAt(0) == gTick) { 395 description.removeBetween(0, 1); 396 } 397 398 // return the description with all the stuff we've just waded through 399 // stripped off the front. It now contains just the rule body. 400 // return description; 401} 402 403/** 404* Searches the rule's rule text for the substitution tokens, 405* creates the substitutions, and removes the substitution tokens 406* from the rule's rule text. 407* @param owner The rule set containing this rule 408* @param predecessor The rule preseding this one in "owners" rule list 409* @param ownersOwner The RuleBasedFormat that owns this rule 410*/ 411void 412NFRule::extractSubstitutions(const NFRuleSet* ruleSet, 413 const UnicodeString &ruleText, 414 const NFRule* predecessor, 415 UErrorCode& status) 416{ 417 if (U_FAILURE(status)) { 418 return; 419 } 420 this->ruleText = ruleText; 421 sub1 = extractSubstitution(ruleSet, predecessor, status); 422 if (sub1 == NULL) { 423 // Small optimization. There is no need to create a redundant NullSubstitution. 424 sub2 = NULL; 425 } 426 else { 427 sub2 = extractSubstitution(ruleSet, predecessor, status); 428 } 429 int32_t pluralRuleStart = this->ruleText.indexOf(gDollarOpenParenthesis, -1, 0); 430 int32_t pluralRuleEnd = (pluralRuleStart >= 0 ? 
this->ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) : -1); 431 if (pluralRuleEnd >= 0) { 432 int32_t endType = this->ruleText.indexOf(gComma, pluralRuleStart); 433 if (endType < 0) { 434 status = U_PARSE_ERROR; 435 return; 436 } 437 UnicodeString type(this->ruleText.tempSubString(pluralRuleStart + 2, endType - pluralRuleStart - 2)); 438 UPluralType pluralType; 439 if (type.startsWith(UNICODE_STRING_SIMPLE("cardinal"))) { 440 pluralType = UPLURAL_TYPE_CARDINAL; 441 } 442 else if (type.startsWith(UNICODE_STRING_SIMPLE("ordinal"))) { 443 pluralType = UPLURAL_TYPE_ORDINAL; 444 } 445 else { 446 status = U_ILLEGAL_ARGUMENT_ERROR; 447 return; 448 } 449 rulePatternFormat = formatter->createPluralFormat(pluralType, 450 this->ruleText.tempSubString(endType + 1, pluralRuleEnd - endType - 1), status); 451 } 452} 453 454/** 455* Searches the rule's rule text for the first substitution token, 456* creates a substitution based on it, and removes the token from 457* the rule's rule text. 458* @param owner The rule set containing this rule 459* @param predecessor The rule preceding this one in the rule set's 460* rule list 461* @param ownersOwner The RuleBasedNumberFormat that owns this rule 462* @return The newly-created substitution. This is never null; if 463* the rule text doesn't contain any substitution tokens, this will 464* be a NullSubstitution. 
465*/ 466NFSubstitution * 467NFRule::extractSubstitution(const NFRuleSet* ruleSet, 468 const NFRule* predecessor, 469 UErrorCode& status) 470{ 471 NFSubstitution* result = NULL; 472 473 // search the rule's rule text for the first two characters of 474 // a substitution token 475 int32_t subStart = indexOfAnyRulePrefix(); 476 int32_t subEnd = subStart; 477 478 // if we didn't find one, create a null substitution positioned 479 // at the end of the rule text 480 if (subStart == -1) { 481 return NULL; 482 } 483 484 // special-case the ">>>" token, since searching for the > at the 485 // end will actually find the > in the middle 486 if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) { 487 subEnd = subStart + 2; 488 489 // otherwise the substitution token ends with the same character 490 // it began with 491 } else { 492 UChar c = ruleText.charAt(subStart); 493 subEnd = ruleText.indexOf(c, subStart + 1); 494 // special case for '<%foo<<' 495 if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) { 496 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle 497 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack 498 // to get around this. Having the duplicate at the front would cause problems with 499 // rules like "<<%" to format, say, percents... 500 ++subEnd; 501 } 502 } 503 504 // if we don't find the end of the token (i.e., if we're on a single, 505 // unmatched token character), create a null substitution positioned 506 // at the end of the rule 507 if (subEnd == -1) { 508 return NULL; 509 } 510 511 // if we get here, we have a real substitution token (or at least 512 // some text bounded by substitution token characters). 
Use 513 // makeSubstitution() to create the right kind of substitution 514 UnicodeString subToken; 515 subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); 516 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, 517 this->formatter, subToken, status); 518 519 // remove the substitution from the rule text 520 ruleText.removeBetween(subStart, subEnd+1); 521 522 return result; 523} 524 525/** 526 * Sets the rule's base value, and causes the radix and exponent 527 * to be recalculated. This is used during construction when we 528 * don't know the rule's base value until after it's been 529 * constructed. It should be used at any other time. 530 * @param The new base value for the rule. 531 */ 532void 533NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) 534{ 535 // set the base value 536 baseValue = newBaseValue; 537 radix = 10; 538 539 // if this isn't a special rule, recalculate the radix and exponent 540 // (the radix always defaults to 10; if it's supposed to be something 541 // else, it's cleaned up by the caller and the exponent is 542 // recalculated again-- the only function that does this is 543 // NFRule.parseRuleDescriptor() ) 544 if (baseValue >= 1) { 545 exponent = expectedExponent(); 546 547 // this function gets called on a fully-constructed rule whose 548 // description didn't specify a base value. This means it 549 // has substitutions, and some substitutions hold on to copies 550 // of the rule's divisor. Fix their copies of the divisor. 551 if (sub1 != NULL) { 552 sub1->setDivisor(radix, exponent, status); 553 } 554 if (sub2 != NULL) { 555 sub2->setDivisor(radix, exponent, status); 556 } 557 558 // if this is a special rule, its radix and exponent are basically 559 // ignored. Set them to "safe" default values 560 } else { 561 exponent = 0; 562 } 563} 564 565/** 566* This calculates the rule's exponent based on its radix and base 567* value. 
This will be the highest power the radix can be raised to 568* and still produce a result less than or equal to the base value. 569*/ 570int16_t 571NFRule::expectedExponent() const 572{ 573 // since the log of 0, or the log base 0 of something, causes an 574 // error, declare the exponent in these cases to be 0 (we also 575 // deal with the special-rule identifiers here) 576 if (radix == 0 || baseValue < 1) { 577 return 0; 578 } 579 580 // we get rounding error in some cases-- for example, log 1000 / log 10 581 // gives us 1.9999999996 instead of 2. The extra logic here is to take 582 // that into account 583 int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix)); 584 int64_t temp = util64_pow(radix, tempResult + 1); 585 if (temp <= baseValue) { 586 tempResult += 1; 587 } 588 return tempResult; 589} 590 591/** 592 * Searches the rule's rule text for any of the specified strings. 593 * @return The index of the first match in the rule's rule text 594 * (i.e., the first substring in the rule's rule text that matches 595 * _any_ of the strings in "strings"). If none of the strings in 596 * "strings" is found in the rule's rule text, returns -1. 597 */ 598int32_t 599NFRule::indexOfAnyRulePrefix() const 600{ 601 int result = -1; 602 for (int i = 0; RULE_PREFIXES[i]; i++) { 603 int32_t pos = ruleText.indexOf(*RULE_PREFIXES[i]); 604 if (pos != -1 && (result == -1 || pos < result)) { 605 result = pos; 606 } 607 } 608 return result; 609} 610 611//----------------------------------------------------------------------- 612// boilerplate 613//----------------------------------------------------------------------- 614 615static UBool 616util_equalSubstitutions(const NFSubstitution* sub1, const NFSubstitution* sub2) 617{ 618 if (sub1) { 619 if (sub2) { 620 return *sub1 == *sub2; 621 } 622 } else if (!sub2) { 623 return TRUE; 624 } 625 return FALSE; 626} 627 628/** 629* Tests two rules for equality. 
630* @param that The rule to compare this one against 631* @return True is the two rules are functionally equivalent 632*/ 633UBool 634NFRule::operator==(const NFRule& rhs) const 635{ 636 return baseValue == rhs.baseValue 637 && radix == rhs.radix 638 && exponent == rhs.exponent 639 && ruleText == rhs.ruleText 640 && util_equalSubstitutions(sub1, rhs.sub1) 641 && util_equalSubstitutions(sub2, rhs.sub2); 642} 643 644/** 645* Returns a textual representation of the rule. This won't 646* necessarily be the same as the description that this rule 647* was created with, but it will produce the same result. 648* @return A textual description of the rule 649*/ 650static void util_append64(UnicodeString& result, int64_t n) 651{ 652 UChar buffer[256]; 653 int32_t len = util64_tou(n, buffer, sizeof(buffer)); 654 UnicodeString temp(buffer, len); 655 result.append(temp); 656} 657 658void 659NFRule::_appendRuleText(UnicodeString& result) const 660{ 661 switch (getType()) { 662 case kNegativeNumberRule: result.append(gMinusX, 2); break; 663 case kImproperFractionRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break; 664 case kProperFractionRule: result.append(gZero).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break; 665 case kMasterRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gZero); break; 666 case kInfinityRule: result.append(gInf, 3); break; 667 case kNaNRule: result.append(gNaN, 3); break; 668 default: 669 // for a normal rule, write out its base value, and if the radix is 670 // something other than 10, write out the radix (with the preceding 671 // slash, of course). Then calculate the expected exponent and if 672 // if isn't the same as the actual exponent, write an appropriate 673 // number of > signs. Finally, terminate the whole thing with 674 // a colon. 
675 util_append64(result, baseValue); 676 if (radix != 10) { 677 result.append(gSlash); 678 util_append64(result, radix); 679 } 680 int numCarets = expectedExponent() - exponent; 681 for (int i = 0; i < numCarets; i++) { 682 result.append(gGreaterThan); 683 } 684 break; 685 } 686 result.append(gColon); 687 result.append(gSpace); 688 689 // if the rule text begins with a space, write an apostrophe 690 // (whitespace after the rule descriptor is ignored; the 691 // apostrophe is used to make the whitespace significant) 692 if (ruleText.charAt(0) == gSpace && (sub1 == NULL || sub1->getPos() != 0)) { 693 result.append(gTick); 694 } 695 696 // now, write the rule's rule text, inserting appropriate 697 // substitution tokens in the appropriate places 698 UnicodeString ruleTextCopy; 699 ruleTextCopy.setTo(ruleText); 700 701 UnicodeString temp; 702 if (sub2 != NULL) { 703 sub2->toString(temp); 704 ruleTextCopy.insert(sub2->getPos(), temp); 705 } 706 if (sub1 != NULL) { 707 sub1->toString(temp); 708 ruleTextCopy.insert(sub1->getPos(), temp); 709 } 710 711 result.append(ruleTextCopy); 712 713 // and finally, top the whole thing off with a semicolon and 714 // return the result 715 result.append(gSemicolon); 716} 717 718//----------------------------------------------------------------------- 719// formatting 720//----------------------------------------------------------------------- 721 722/** 723* Formats the number, and inserts the resulting text into 724* toInsertInto. 
725* @param number The number being formatted 726* @param toInsertInto The string where the resultant text should 727* be inserted 728* @param pos The position in toInsertInto where the resultant text 729* should be inserted 730*/ 731void 732NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const 733{ 734 // first, insert the rule's rule text into toInsertInto at the 735 // specified position, then insert the results of the substitutions 736 // into the right places in toInsertInto (notice we do the 737 // substitutions in reverse order so that the offsets don't get 738 // messed up) 739 int32_t pluralRuleStart = ruleText.length(); 740 int32_t lengthOffset = 0; 741 if (!rulePatternFormat) { 742 toInsertInto.insert(pos, ruleText); 743 } 744 else { 745 pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0); 746 int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart); 747 int initialLength = toInsertInto.length(); 748 if (pluralRuleEnd < ruleText.length() - 1) { 749 toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2)); 750 } 751 toInsertInto.insert(pos, 752 rulePatternFormat->format((int32_t)(number/uprv_pow(radix, exponent)), status)); 753 if (pluralRuleStart > 0) { 754 toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart)); 755 } 756 lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength); 757 } 758 759 if (sub2 != NULL) { 760 sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status); 761 } 762 if (sub1 != NULL) { 763 sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status); 764 } 765} 766 767/** 768* Formats the number, and inserts the resulting text into 769* toInsertInto. 
770* @param number The number being formatted 771* @param toInsertInto The string where the resultant text should 772* be inserted 773* @param pos The position in toInsertInto where the resultant text 774* should be inserted 775*/ 776void 777NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const 778{ 779 // first, insert the rule's rule text into toInsertInto at the 780 // specified position, then insert the results of the substitutions 781 // into the right places in toInsertInto 782 // [again, we have two copies of this routine that do the same thing 783 // so that we don't sacrifice precision in a long by casting it 784 // to a double] 785 int32_t pluralRuleStart = ruleText.length(); 786 int32_t lengthOffset = 0; 787 if (!rulePatternFormat) { 788 toInsertInto.insert(pos, ruleText); 789 } 790 else { 791 pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0); 792 int pluralRuleEnd = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart); 793 int initialLength = toInsertInto.length(); 794 if (pluralRuleEnd < ruleText.length() - 1) { 795 toInsertInto.insert(pos, ruleText.tempSubString(pluralRuleEnd + 2)); 796 } 797 double pluralVal = number; 798 if (0 <= pluralVal && pluralVal < 1) { 799 // We're in a fractional rule, and we have to match the NumeratorSubstitution behavior. 800 // 2.3 can become 0.2999999999999998 for the fraction due to rounding errors. 
801 pluralVal = uprv_round(pluralVal * uprv_pow(radix, exponent)); 802 } 803 else { 804 pluralVal = pluralVal / uprv_pow(radix, exponent); 805 } 806 toInsertInto.insert(pos, rulePatternFormat->format((int32_t)(pluralVal), status)); 807 if (pluralRuleStart > 0) { 808 toInsertInto.insert(pos, ruleText.tempSubString(0, pluralRuleStart)); 809 } 810 lengthOffset = ruleText.length() - (toInsertInto.length() - initialLength); 811 } 812 813 if (sub2 != NULL) { 814 sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status); 815 } 816 if (sub1 != NULL) { 817 sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status); 818 } 819} 820 821/** 822* Used by the owning rule set to determine whether to invoke the 823* rollback rule (i.e., whether this rule or the one that precedes 824* it in the rule set's list should be used to format the number) 825* @param The number being formatted 826* @return True if the rule set should use the rule that precedes 827* this one in its list; false if it should use this rule 828*/ 829UBool 830NFRule::shouldRollBack(double number) const 831{ 832 // we roll back if the rule contains a modulus substitution, 833 // the number being formatted is an even multiple of the rule's 834 // divisor, and the rule's base value is NOT an even multiple 835 // of its divisor 836 // In other words, if the original description had 837 // 100: << hundred[ >>]; 838 // that expands into 839 // 100: << hundred; 840 // 101: << hundred >>; 841 // internally. But when we're formatting 200, if we use the rule 842 // at 101, which would normally apply, we get "two hundred zero". 843 // To prevent this, we roll back and use the rule at 100 instead. 
844 // This is the logic that makes this happen: the rule at 101 has 845 // a modulus substitution, its base value isn't an even multiple 846 // of 100, and the value we're trying to format _is_ an even 847 // multiple of 100. This is called the "rollback rule." 848 if ((sub1 != NULL && sub1->isModulusSubstitution()) || (sub2 != NULL && sub2->isModulusSubstitution())) { 849 int64_t re = util64_pow(radix, exponent); 850 return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0; 851 } 852 return FALSE; 853} 854 855//----------------------------------------------------------------------- 856// parsing 857//----------------------------------------------------------------------- 858 859/** 860* Attempts to parse the string with this rule. 861* @param text The string being parsed 862* @param parsePosition On entry, the value is ignored and assumed to 863* be 0. On exit, this has been updated with the position of the first 864* character not consumed by matching the text against this rule 865* (if this rule doesn't match the text at all, the parse position 866* if left unchanged (presumably at 0) and the function returns 867* new Long(0)). 868* @param isFractionRule True if this rule is contained within a 869* fraction rule set. This is only used if the rule has no 870* substitutions. 871* @return If this rule matched the text, this is the rule's base value 872* combined appropriately with the results of parsing the substitutions. 873* If nothing matched, this is new Long(0) and the parse position is 874* left unchanged. The result will be an instance of Long if the 875* result is an integer and Double otherwise. The result is never null. 
*/
#ifdef RBNF_DEBUG
#include <stdio.h>

// Debug-only helper (compiled only when RBNF_DEBUG is defined): writes the
// contents of "us" to the stdio stream "f" via a temporary char buffer.
// NOTE(review): the extract() overload used here is not given the buffer
// capacity; buf holds len+1 bytes, which presumably is only sufficient when
// every UChar converts to a single byte in the default codepage -- confirm.
static void dumpUS(FILE* f, const UnicodeString& us) {
    int len = us.length();
    char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1];
    if (buf != NULL) {
        us.extract(0, len, buf);
        buf[len] = 0;
        fprintf(f, "%s", buf);
        uprv_free(buf); //delete[] buf;
    }
}
#endif
// Parses "text" against this rule. In this C++ version the numeric result
// is delivered through resVal, and every return statement in the body
// returns TRUE; a failed match is signalled by parsePosition's index
// staying at 0 (with its error index updated), not by the return value.
// upperBound is passed through unchanged to matchToDelimiter().
UBool
NFRule::doParse(const UnicodeString& text,
                ParsePosition& parsePosition,
                UBool isFractionRule,
                double upperBound,
                Formattable& resVal) const
{
    // internally we operate on a copy of the string being parsed
    // (because we're going to change it) and use our own ParsePosition
    ParsePosition pp;
    UnicodeString workText(text);

    // a missing substitution is treated as sitting at the very end of
    // the rule text
    int32_t sub1Pos = sub1 != NULL ? sub1->getPos() : ruleText.length();
    int32_t sub2Pos = sub2 != NULL ? sub2->getPos() : ruleText.length();

    // check to see whether the text before the first substitution
    // matches the text at the beginning of the string being
    // parsed. If it does, strip that off the front of workText;
    // otherwise, dump out with a mismatch
    UnicodeString prefix;
    prefix.setTo(ruleText, 0, sub1Pos);

#ifdef RBNF_DEBUG
    fprintf(stderr, "doParse %p ", this);
    {
        UnicodeString rt;
        _appendRuleText(rt);
        dumpUS(stderr, rt);
    }

    fprintf(stderr, " text: '");
    dumpUS(stderr, text);
    fprintf(stderr, "' prefix: '");
    dumpUS(stderr, prefix);
#endif
    stripPrefix(workText, prefix, pp);
    // number of characters of "text" that stripPrefix() removed
    int32_t prefixLength = text.length() - workText.length();

#ifdef RBNF_DEBUG
    fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1Pos);
#endif

    // the rule has leading literal text (sub1Pos != 0) but none of it
    // matched: report a mismatch (result 0, parsePosition index untouched)
    if (pp.getIndex() == 0 && sub1Pos != 0) {
        // commented out because ParsePosition doesn't have error index in 1.1.x
        // restored for ICU4C port
        parsePosition.setErrorIndex(pp.getErrorIndex());
        resVal.setLong(0);
        return TRUE;
    }
    if (baseValue == kInfinityRule) {
        // If you match this, don't try to perform any calculations on it.
        parsePosition.setIndex(pp.getIndex());
        resVal.setDouble(uprv_getInfinity());
        return TRUE;
    }
    if (baseValue == kNaNRule) {
        // If you match this, don't try to perform any calculations on it.
        parsePosition.setIndex(pp.getIndex());
        resVal.setDouble(uprv_getNaN());
        return TRUE;
    }

    // this is the fun part. The basic guts of the rule-matching
    // logic is matchToDelimiter(), which is called twice. The first
    // time it searches the input string for the rule text BETWEEN
    // the substitutions and tries to match the intervening text
    // in the input string with the first substitution. If that
    // succeeds, it then calls it again, this time to look for the
    // rule text after the second substitution and to match the
    // intervening input text against the second substitution.
    //
    // For example, say we have a rule that looks like this:
    //    first << middle >> last;
    // and input text that looks like this:
    //    first one middle two last
    // First we use stripPrefix() to match "first " in both places and
    // strip it off the front, leaving
    //    one middle two last
    // Then we use matchToDelimiter() to match " middle " and try to
    // match "one" against a substitution. If it's successful, we now
    // have
    //    two last
    // We use matchToDelimiter() a second time to match " last" and
    // try to match "two" against a substitution. If "two" matches
    // the substitution, we have a successful parse.
    //
    // Since it's possible in many cases to find multiple instances
    // of each of these pieces of rule text in the input string,
    // we need to try all the possible combinations of these
    // locations. This prevents us from prematurely declaring a mismatch,
    // and makes sure we match as much input text as we can.
    int highWaterMark = 0;
    double result = 0;
    int start = 0;
    // non-positive base values (which include the special rule markers
    // compared against above) contribute 0 to the parse result
    double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue);

    UnicodeString temp;
    do {
        // our partial parse result starts out as this rule's base
        // value. If it finds a successful match, matchToDelimiter()
        // will compose this in some way with what it gets back from
        // the substitution, giving us a new partial parse result
        pp.setIndex(0);

        // temp = the rule text between the two substitutions
        temp.setTo(ruleText, sub1Pos, sub2Pos - sub1Pos);
        double partialResult = matchToDelimiter(workText, start, tempBaseValue,
            temp, pp, sub1,
            upperBound);

        // if we got a successful match (or were trying to match a
        // null substitution), pp is now pointing at the first unmatched
        // character. Take note of that, and try matchToDelimiter()
        // on the input text again
        if (pp.getIndex() != 0 || sub1 == NULL) {
            start = pp.getIndex();

            UnicodeString workText2;
            workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
            ParsePosition pp2;

            // the second matchToDelimiter() will compose our previous
            // partial result with whatever it gets back from its
            // substitution if there's a successful match, giving us
            // a real result
            // temp = the rule text following the second substitution
            temp.setTo(ruleText, sub2Pos, ruleText.length() - sub2Pos);
            partialResult = matchToDelimiter(workText2, 0, partialResult,
                temp, pp2, sub2,
                upperBound);

            // if we got a successful match on this second
            // matchToDelimiter() call, update the high-water mark
            // and result (if necessary)
            if (pp2.getIndex() != 0 || sub2 == NULL) {
                if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
                    highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
                    result = partialResult;
                }
            }
            else {
                // commented out because ParsePosition doesn't have error index in 1.1.x
                // restored for ICU4C port
                // (this int32_t "temp" shadows the UnicodeString "temp"
                // declared before the loop; only the furthest error
                // index seen so far is kept)
                int32_t temp = pp2.getErrorIndex() + sub1Pos + pp.getIndex();
                if (temp> parsePosition.getErrorIndex()) {
                    parsePosition.setErrorIndex(temp);
                }
            }
        }
        else {
            // commented out because ParsePosition doesn't have error index in 1.1.x
            // restored for ICU4C port
            // (same shadowing of the outer "temp" as in the branch above)
            int32_t temp = sub1Pos + pp.getErrorIndex();
            if (temp > parsePosition.getErrorIndex()) {
                parsePosition.setErrorIndex(temp);
            }
        }
        // keep trying to match things until the outer matchToDelimiter()
        // call fails to make a match (each time, it picks up where it
        // left off the previous time)
        // NOTE(review): whenever pp.getIndex() > 0 at this point, "start"
        // was just assigned pp.getIndex() above and pp is not modified
        // afterwards, so the last condition looks like it is always false
        // and the body runs exactly once -- confirm whether a retry was
        // intended.
    } while (sub1Pos != sub2Pos
        && pp.getIndex() > 0
        && pp.getIndex() < workText.length()
        && pp.getIndex() != start);

    // update the caller's ParsePosition with our high-water mark
    // (i.e., it now points at the first character this function
    // didn't match-- the ParsePosition is therefore unchanged if
    // we didn't match anything)
    parsePosition.setIndex(highWaterMark);
    // commented out because ParsePosition doesn't have error index in 1.1.x
    // restored for ICU4C port
    if (highWaterMark > 0) {
        parsePosition.setErrorIndex(0);
    }

    // this is a hack for one unusual condition: Normally, whether this
    // rule belong to a fraction rule set or not is handled by its
    // substitutions. But if that rule HAS NO substitutions, then
    // we have to account for it here. By definition, if the matching
    // rule in a fraction rule set has no substitutions, its numerator
    // is 1, and so the result is the reciprocal of its base value.
    if (isFractionRule && highWaterMark > 0 && sub1 == NULL) {
        result = 1 / result;
    }

    resVal.setDouble(result);
    return TRUE; // ??? do we need to worry if it is a long or a double?
}

/**
* This function is used by parse() to match the text being parsed
* against a possible prefix string. This function
* matches characters from the beginning of the string being parsed
* to characters from the prospective prefix. If they match, pp is
* advanced past the matched characters and the matched prefix is
* removed from "text" in place. If they don't match, "text" and pp
* are both left unchanged.
* @param text The string being parsed; modified in place on a match
* @param prefix The text to match against
* @param pp On a match, its index is increased by the number of
* characters matched; otherwise it is left unchanged.
* (The C++ function returns nothing; the Java original returned the
* unparsed part of "text", or "text" itself when nothing matched.)
1094*/ 1095void 1096NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const 1097{ 1098 // if the prefix text is empty, dump out without doing anything 1099 if (prefix.length() != 0) { 1100 UErrorCode status = U_ZERO_ERROR; 1101 // use prefixLength() to match the beginning of 1102 // "text" against "prefix". This function returns the 1103 // number of characters from "text" that matched (or 0 if 1104 // we didn't match the whole prefix) 1105 int32_t pfl = prefixLength(text, prefix, status); 1106 if (U_FAILURE(status)) { // Memory allocation error. 1107 return; 1108 } 1109 if (pfl != 0) { 1110 // if we got a successful match, update the parse position 1111 // and strip the prefix off of "text" 1112 pp.setIndex(pp.getIndex() + pfl); 1113 text.remove(0, pfl); 1114 } 1115 } 1116} 1117 1118/** 1119* Used by parse() to match a substitution and any following text. 1120* "text" is searched for instances of "delimiter". For each instance 1121* of delimiter, the intervening text is tested to see whether it 1122* matches the substitution. The longest match wins. 1123* @param text The string being parsed 1124* @param startPos The position in "text" where we should start looking 1125* for "delimiter". 1126* @param baseValue A partial parse result (often the rule's base value), 1127* which is combined with the result from matching the substitution 1128* @param delimiter The string to search "text" for. 1129* @param pp Ignored and presumed to be 0 on entry. If there's a match, 1130* on exit this will point to the first unmatched character. 1131* @param sub If we find "delimiter" in "text", this substitution is used 1132* to match the text between the beginning of the string and the 1133* position of "delimiter." (If "delimiter" is the empty string, then 1134* this function just matches against this substitution and updates 1135* everything accordingly.) 
* @param upperBound When matching the substitution, it will only
* consider rules with base values lower than this value.
* @return If there's a match, this is the result of composing
* baseValue with the result of matching the substitution. Otherwise,
* this is 0 (the Java original returned new Long(0), and a Long or
* Double object in general; this port always returns a double).
*
* !!! note {dlf} in point of fact, in the java code the caller always converts
* the result to a double, so we might as well return one.
*/
double
NFRule::matchToDelimiter(const UnicodeString& text,
                         int32_t startPos,
                         double _baseValue,
                         const UnicodeString& delimiter,
                         ParsePosition& pp,
                         const NFSubstitution* sub,
                         double upperBound) const
{
    UErrorCode status = U_ZERO_ERROR;
    // if "delimiter" contains real (i.e., non-ignorable) text, search
    // it for "delimiter" beginning at "start". If that succeeds, then
    // use "sub"'s doParse() method to match the text before the
    // instance of "delimiter" we just found.
    if (!allIgnorable(delimiter, status)) {
        // allIgnorable() returns FALSE on failure, so a failure is
        // caught on this branch
        if (U_FAILURE(status)) { //Memory allocation error.
            return 0;
        }
        ParsePosition tempPP;
        Formattable result;

        // use findText() to search for "delimiter". It returns the
        // position of the match and stores in dLen the number of
        // characters that matched "delimiter".
        int32_t dLen;
        int32_t dPos = findText(text, delimiter, startPos, &dLen);

        // if findText() succeeded, isolate the text preceding the
        // match, and use "sub" to match that text
        while (dPos >= 0) {
            UnicodeString subText;
            subText.setTo(text, 0, dPos);
            if (subText.length() > 0) {
                // NOTE(review): sub is dereferenced here without a NULL
                // check; presumably callers only pass a non-ignorable
                // delimiter together with a real substitution -- confirm.
                UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound,
#if UCONFIG_NO_COLLATION
                    FALSE,
#else
                    formatter->isLenient(),
#endif
                    result);

                // if the substitution could match all the text up to
                // where we found "delimiter", then this function has
                // a successful match. Bump the caller's parse position
                // to point to the first character after the text
                // that matches "delimiter", and return the result
                // we got from parsing the substitution.
                if (success && tempPP.getIndex() == dPos) {
                    pp.setIndex(dPos + dLen);
                    return result.getDouble();
                }
                else {
                    // commented out because ParsePosition doesn't have error index in 1.1.x
                    // restored for ICU4C port
                    if (tempPP.getErrorIndex() > 0) {
                        pp.setErrorIndex(tempPP.getErrorIndex());
                    } else {
                        pp.setErrorIndex(tempPP.getIndex());
                    }
                }
            }

            // if we didn't match the substitution, search for another
            // copy of "delimiter" in "text" and repeat the loop if
            // we find it
            tempPP.setIndex(0);
            dPos = findText(text, delimiter, dPos + dLen, &dLen);
        }
        // if we make it here, this was an unsuccessful match: reset
        // pp's index to 0 and return 0
        pp.setIndex(0);
        return 0;

        // if "delimiter" is empty, or consists only of ignorable characters
        // (i.e., is semantically empty), then we obviously can't search
        // for "delimiter". Instead, just use "sub" to parse as much of
        // "text" as possible.
    }
    else if (sub == NULL) {
        // nothing to search for and no substitution: the partial result
        // passes through unchanged and pp is not touched
        return _baseValue;
    }
    else {
        ParsePosition tempPP;
        Formattable result;

        // try to match the whole string against the substitution
        UBool success = sub->doParse(text, tempPP, _baseValue, upperBound,
#if UCONFIG_NO_COLLATION
            FALSE,
#else
            formatter->isLenient(),
#endif
            result);
        if (success && (tempPP.getIndex() != 0)) {
            // if there's a successful match (or it's a null
            // substitution), update pp to point to the first
            // character we didn't match, and pass the result from
            // sub.doParse() on through to the caller
            pp.setIndex(tempPP.getIndex());
            return result.getDouble();
        }
        else {
            // commented out because ParsePosition doesn't have error index in 1.1.x
            // restored for ICU4C port
            pp.setErrorIndex(tempPP.getErrorIndex());
        }

        // and if we get to here, then nothing matched, so we return
        // 0 and leave pp's index alone
        return 0;
    }
}

/**
* Used by stripPrefix() to match characters. If lenient parse mode
* is off, this just calls startsWith(). If lenient parse mode is on,
* this function uses CollationElementIterators to match characters in
* the strings (only primary-order differences are significant in
* determining whether there's a match).
* @param str The string being tested
* @param prefix The text we're hoping to see at the beginning
* of "str"
* @param status Set to U_MEMORY_ALLOCATION_ERROR (with 0 returned) when,
* in lenient mode, the formatter supplies no collator or a collation
* element iterator cannot be created.
* @return If "prefix" is found at the beginning of "str", this
* is the number of characters in "str" that were matched (this
* isn't necessarily the same as the length of "prefix" when matching
* text with a collator). If there's no match, this is 0.
*/
int32_t
NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const
{
    // if we're looking for an empty prefix, it obviously matches
    // zero characters. Just go ahead and return 0.
    if (prefix.length() == 0) {
        return 0;
    }

#if !UCONFIG_NO_COLLATION
    // go through all this grief if we're in lenient-parse mode
    if (formatter->isLenient()) {
        // get the formatter's collator and use it to create two
        // collation element iterators, one over the target string
        // and another over the prefix (right now, we'll throw an
        // exception if the collator we get back from the formatter
        // isn't a RuleBasedCollator, because RuleBasedCollator defines
        // the CollationElementIterator protocol. Hopefully, this
        // will change someday.)
        const RuleBasedCollator* collator = formatter->getCollator();
        if (collator == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str));
        LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix));
        // Check for memory allocation error.
        if (strIter.isNull() || prefixIter.isNull()) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }

        // separate error code for the iterator calls below; it is
        // never inspected in this function
        UErrorCode err = U_ZERO_ERROR;

        // The original code was problematic. Consider this match:
        // prefix = "fifty-"
        // string = " fifty-7"
        // The intent is to match string up to the '7', by matching 'fifty-' at position 1
        // in the string. Unfortunately, we were getting a match, and then computing where
        // the match terminated by rematching the string. The rematch code was using as an
        // initial guess the substring of string between 0 and prefix.length. Because of
        // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving
        // the position before the hyphen in the string. Recursing down, we then parsed the
        // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7).
        // This was not pretty, especially since the string "fifty-7" parsed just fine.
        //
        // We have newer APIs now, so we can use calls on the iterator to determine what we
        // matched up to. If we terminate because we hit the last element in the string,
        // our match terminates at this length. If we terminate because we hit the last element
        // in the target, our match terminates at one before the element iterator position.

        // match collation elements between the strings
        int32_t oStr = strIter->next(err);
        int32_t oPrefix = prefixIter->next(err);

        while (oPrefix != CollationElementIterator::NULLORDER) {
            // skip over ignorable characters in the target string
            while (CollationElementIterator::primaryOrder(oStr) == 0
                && oStr != CollationElementIterator::NULLORDER) {
                oStr = strIter->next(err);
            }

            // skip over ignorable characters in the prefix
            while (CollationElementIterator::primaryOrder(oPrefix) == 0
                && oPrefix != CollationElementIterator::NULLORDER) {
                oPrefix = prefixIter->next(err);
            }

            // dlf: move this above following test, if we consume the
            // entire target, aren't we ok even if the source was also
            // entirely consumed?

            // if skipping over ignorables brought to the end of
            // the prefix, we DID match: drop out of the loop
            if (oPrefix == CollationElementIterator::NULLORDER) {
                break;
            }

            // if skipping over ignorables brought us to the end
            // of the target string, we didn't match and return 0
            if (oStr == CollationElementIterator::NULLORDER) {
                return 0;
            }

            // match collation elements from the two strings
            // (considering only primary differences). If we
            // get a mismatch, dump out and return 0
            if (CollationElementIterator::primaryOrder(oStr)
                != CollationElementIterator::primaryOrder(oPrefix)) {
                return 0;

                // otherwise, advance to the next character in each string
                // and loop (we drop out of the loop when we exhaust
                // collation elements in the prefix)
            } else {
                oStr = strIter->next(err);
                oPrefix = prefixIter->next(err);
            }
        }

        // the whole prefix matched: the match length is the string
        // iterator's current offset, less one if an element beyond the
        // match was already fetched from the string
        int32_t result = strIter->getOffset();
        if (oStr != CollationElementIterator::NULLORDER) {
            --result; // back over character that we don't want to consume;
        }

#ifdef RBNF_DEBUG
        fprintf(stderr, "prefix length: %d\n", result);
#endif
        return result;
        // everything from here to the matching #endif is disabled legacy
        // code kept from the Java port; it is never compiled
#if 0
        //----------------------------------------------------------------
        // JDK 1.2-specific API call
        // return strIter.getOffset();
        //----------------------------------------------------------------
        // JDK 1.1 HACK (take out for 1.2-specific code)

        // if we make it to here, we have a successful match. Now we
        // have to find out HOW MANY characters from the target string
        // matched the prefix (there isn't necessarily a one-to-one
        // mapping between collation elements and characters).
        // In JDK 1.2, there's a simple getOffset() call we can use.
        // In JDK 1.1, on the other hand, we have to go through some
        // ugly contortions. First, use the collator to compare the
        // same number of characters from the prefix and target string.
        // If they're equal, we're done.
        collator->setStrength(Collator::PRIMARY);
        if (str.length() >= prefix.length()) {
            UnicodeString temp;
            temp.setTo(str, 0, prefix.length());
            if (collator->equals(temp, prefix)) {
#ifdef RBNF_DEBUG
                fprintf(stderr, "returning: %d\n", prefix.length());
#endif
                return prefix.length();
            }
        }

        // if they're not equal, then we have to compare successively
        // larger and larger substrings of the target string until we
        // get to one that matches the prefix. At that point, we know
        // how many characters matched the prefix, and we can return.
        int32_t p = 1;
        while (p <= str.length()) {
            UnicodeString temp;
            temp.setTo(str, 0, p);
            if (collator->equals(temp, prefix)) {
                return p;
            } else {
                ++p;
            }
        }

        // SHOULD NEVER GET HERE!!!
        return 0;
        //----------------------------------------------------------------
#endif

        // If lenient parsing is turned off, forget all that crap above.
        // Just use String.startsWith() and be done with it.
    } else
#endif
    {
        // strict match: all of "prefix" must appear verbatim at the
        // start of "str"
        if (str.startsWith(prefix)) {
            return prefix.length();
        } else {
            return 0;
        }
    }
}

/**
* Searches a string for another string. If lenient parsing is off,
* this just calls indexOf(). If lenient parsing is on, this function
* uses CollationElementIterator to match characters, and only
* primary-order differences are significant in determining whether
* there's a match.
* @param str The string to search
* @param key The string to search "str" for
* @param startingAt The index into "str" where the search is to
* begin
* @return A two-element array of ints. Element 0 is the position
* of the match, or -1 if there was no match.
Element 1 is the 1457* number of characters in "str" that matched (which isn't necessarily 1458* the same as the length of "key") 1459*/ 1460int32_t 1461NFRule::findText(const UnicodeString& str, 1462 const UnicodeString& key, 1463 int32_t startingAt, 1464 int32_t* length) const 1465{ 1466 if (rulePatternFormat) { 1467 Formattable result; 1468 FieldPosition position(UNUM_INTEGER_FIELD); 1469 position.setBeginIndex(startingAt); 1470 rulePatternFormat->parseType(str, this, result, position); 1471 int start = position.getBeginIndex(); 1472 if (start >= 0) { 1473 int32_t pluralRuleStart = ruleText.indexOf(gDollarOpenParenthesis, -1, 0); 1474 int32_t pluralRuleSuffix = ruleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) + 2; 1475 int32_t matchLen = position.getEndIndex() - start; 1476 UnicodeString prefix(ruleText.tempSubString(0, pluralRuleStart)); 1477 UnicodeString suffix(ruleText.tempSubString(pluralRuleSuffix)); 1478 if (str.compare(start - prefix.length(), prefix.length(), prefix, 0, prefix.length()) == 0 1479 && str.compare(start + matchLen, suffix.length(), suffix, 0, suffix.length()) == 0) 1480 { 1481 *length = matchLen + prefix.length() + suffix.length(); 1482 return start - prefix.length(); 1483 } 1484 } 1485 *length = 0; 1486 return -1; 1487 } 1488 if (!formatter->isLenient()) { 1489 // if lenient parsing is turned off, this is easy: just call 1490 // String.indexOf() and we're done 1491 *length = key.length(); 1492 return str.indexOf(key, startingAt); 1493 } 1494 else { 1495 // but if lenient parsing is turned ON, we've got some work 1496 // ahead of us 1497 return findTextLenient(str, key, startingAt, length); 1498 } 1499} 1500 1501int32_t 1502NFRule::findTextLenient(const UnicodeString& str, 1503 const UnicodeString& key, 1504 int32_t startingAt, 1505 int32_t* length) const 1506{ 1507 //---------------------------------------------------------------- 1508 // JDK 1.1 HACK (take out of 1.2-specific code) 1509 1510 // in JDK 1.2, 
CollationElementIterator provides us with an 1511 // API to map between character offsets and collation elements 1512 // and we can do this by marching through the string comparing 1513 // collation elements. We can't do that in JDK 1.1. Insted, 1514 // we have to go through this horrible slow mess: 1515 int32_t p = startingAt; 1516 int32_t keyLen = 0; 1517 1518 // basically just isolate smaller and smaller substrings of 1519 // the target string (each running to the end of the string, 1520 // and with the first one running from startingAt to the end) 1521 // and then use prefixLength() to see if the search key is at 1522 // the beginning of each substring. This is excruciatingly 1523 // slow, but it will locate the key and tell use how long the 1524 // matching text was. 1525 UnicodeString temp; 1526 UErrorCode status = U_ZERO_ERROR; 1527 while (p < str.length() && keyLen == 0) { 1528 temp.setTo(str, p, str.length() - p); 1529 keyLen = prefixLength(temp, key, status); 1530 if (U_FAILURE(status)) { 1531 break; 1532 } 1533 if (keyLen != 0) { 1534 *length = keyLen; 1535 return p; 1536 } 1537 ++p; 1538 } 1539 // if we make it to here, we didn't find it. Return -1 for the 1540 // location. The length should be ignored, but set it to 0, 1541 // which should be "safe" 1542 *length = 0; 1543 return -1; 1544} 1545 1546/** 1547* Checks to see whether a string consists entirely of ignorable 1548* characters. 1549* @param str The string to test. 1550* @return true if the string is empty of consists entirely of 1551* characters that the number formatter's collator says are 1552* ignorable at the primary-order level. false otherwise. 
1553*/ 1554UBool 1555NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const 1556{ 1557 // if the string is empty, we can just return true 1558 if (str.length() == 0) { 1559 return TRUE; 1560 } 1561 1562#if !UCONFIG_NO_COLLATION 1563 // if lenient parsing is turned on, walk through the string with 1564 // a collation element iterator and make sure each collation 1565 // element is 0 (ignorable) at the primary level 1566 if (formatter->isLenient()) { 1567 const RuleBasedCollator* collator = formatter->getCollator(); 1568 if (collator == NULL) { 1569 status = U_MEMORY_ALLOCATION_ERROR; 1570 return FALSE; 1571 } 1572 LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str)); 1573 1574 // Memory allocation error check. 1575 if (iter.isNull()) { 1576 status = U_MEMORY_ALLOCATION_ERROR; 1577 return FALSE; 1578 } 1579 1580 UErrorCode err = U_ZERO_ERROR; 1581 int32_t o = iter->next(err); 1582 while (o != CollationElementIterator::NULLORDER 1583 && CollationElementIterator::primaryOrder(o) == 0) { 1584 o = iter->next(err); 1585 } 1586 1587 return o == CollationElementIterator::NULLORDER; 1588 } 1589#endif 1590 1591 // if lenient parsing is turned off, there is no such thing as 1592 // an ignorable character: return true only if the string is empty 1593 return FALSE; 1594} 1595 1596void 1597NFRule::setDecimalFormatSymbols(const DecimalFormatSymbols& newSymbols, UErrorCode& status) { 1598 if (sub1 != NULL) { 1599 sub1->setDecimalFormatSymbols(newSymbols, status); 1600 } 1601 if (sub2 != NULL) { 1602 sub2->setDecimalFormatSymbols(newSymbols, status); 1603 } 1604} 1605 1606U_NAMESPACE_END 1607 1608/* U_HAVE_RBNF */ 1609#endif 1610