1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 * Copyright (C) 2015, International Business Machines
5 * Corporation and others.  All Rights Reserved.
6 *
7 * file name: affixpatternparser.cpp
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_FORMATTING
13
14#include "unicode/dcfmtsym.h"
15#include "unicode/plurrule.h"
16#include "unicode/ucurr.h"
17#include "affixpatternparser.h"
18#include "charstr.h"
19#include "precision.h"
20#include "uassert.h"
21#include "unistrappender.h"
22
        // Three generic currency signs (U+00A4). CurrencyAffixInfo uses the
        // first 1, 2 and 3 UChars of this array as its default symbol, ISO
        // code and long name respectively.
        static UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};

// Affix characters used by the no-argument AffixPatternParser constructor.
static UChar gPercent = 0x25;    // '%'
static UChar gPerMill = 0x2030;  // per mille sign
static UChar gNegative = 0x2D;   // '-'
static UChar gPositive = 0x2B;   // '+'
29
// Token encoding helpers. Each token occupies one UChar: the high byte holds
// the token type in its low 7 bits plus a flag in bit 0x80, and the low byte
// holds a length (or one byte of a multi-UChar literal length).

// Packs token type t (optionally OR'ed with 0x80) and the low 8 bits of l
// into a single UChar. Both arguments are parenthesized so that expression
// arguments (e.g. `a | b`) are masked as a whole.
#define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | ((l) & 0xFF)))

// Extracts the token type from c, ignoring the 0x80 flag bit.
#define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))

// Non-zero if the 0x80 flag bit of the high byte of c is set.
#define UNPACK_LONG(c) (((c) >> 8) & 0x80)

// Extracts the low byte (length byte) of c.
#define UNPACK_LENGTH(c) ((c) & 0xFF)
37
38U_NAMESPACE_BEGIN
39
40static int32_t
41nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
42    if (buffer[idx] != 0x27 || idx + 1 == len) {
43        *token = buffer[idx];
44        return 1;
45    }
46    *token = buffer[idx + 1];
47    if (buffer[idx + 1] == 0xA4) {
48        int32_t i = 2;
49        for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i)
50          ;
51        return i;
52    }
53    return 2;
54}
55
56static int32_t
57nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
58    *token = buffer[idx];
59    int32_t max;
60    switch (buffer[idx]) {
61    case 0x27:
62        max = 2;
63        break;
64    case 0xA4:
65        max = 3;
66        break;
67    default:
68        max = 1;
69        break;
70    }
71    int32_t i = 1;
72    for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i)
73      ;
74    return i;
75}
76
// Constructs a CurrencyAffixInfo in its default state: the symbol, ISO code
// and long name are built from the first 1, 2 and 3 UChars of
// gDefaultSymbols respectively, and fIsDefault is TRUE.
CurrencyAffixInfo::CurrencyAffixInfo()
        : fSymbol(gDefaultSymbols, 1),
          fISO(gDefaultSymbols, 2),
          fLong(DigitAffix(gDefaultSymbols, 3)),
          fIsDefault(TRUE) {
}
83
84void
85CurrencyAffixInfo::set(
86        const char *locale,
87        const PluralRules *rules,
88        const UChar *currency,
89        UErrorCode &status) {
90    if (U_FAILURE(status)) {
91        return;
92    }
93    fIsDefault = FALSE;
94    if (currency == NULL) {
95        fSymbol.setTo(gDefaultSymbols, 1);
96        fISO.setTo(gDefaultSymbols, 2);
97        fLong.remove();
98        fLong.append(gDefaultSymbols, 3);
99        fIsDefault = TRUE;
100        return;
101    }
102    int32_t len;
103    UBool unusedIsChoice;
104    const UChar *symbol = ucurr_getName(
105            currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
106            &len, &status);
107    if (U_FAILURE(status)) {
108        return;
109    }
110    fSymbol.setTo(symbol, len);
111    fISO.setTo(currency, u_strlen(currency));
112    fLong.remove();
113    StringEnumeration* keywords = rules->getKeywords(status);
114    if (U_FAILURE(status)) {
115        return;
116    }
117    const UnicodeString* pluralCount;
118    while ((pluralCount = keywords->snext(status)) != NULL) {
119        CharString pCount;
120        pCount.appendInvariantChars(*pluralCount, status);
121        const UChar *pluralName = ucurr_getPluralName(
122            currency, locale, &unusedIsChoice, pCount.data(),
123            &len, &status);
124        fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
125    }
126    delete keywords;
127}
128
129void
130CurrencyAffixInfo::adjustPrecision(
131        const UChar *currency, const UCurrencyUsage usage,
132        FixedPrecision &precision, UErrorCode &status) {
133    if (U_FAILURE(status)) {
134        return;
135    }
136
137    int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
138            currency, usage, &status);
139    precision.fMin.setFracDigitCount(digitCount);
140    precision.fMax.setFracDigitCount(digitCount);
141    double increment = ucurr_getRoundingIncrementForUsage(
142            currency, usage, &status);
143    if (increment == 0.0) {
144        precision.fRoundingIncrement.clear();
145    } else {
146        precision.fRoundingIncrement.set(increment);
147        // guard against round-off error
148        precision.fRoundingIncrement.round(6);
149    }
150}
151
152void
153AffixPattern::addLiteral(
154        const UChar *literal, int32_t start, int32_t len) {
155    char32Count += u_countChar32(literal + start, len);
156    literals.append(literal, start, len);
157    int32_t tlen = tokens.length();
158    // Takes 4 UChars to encode maximum literal length.
159    UChar *tokenChars = tokens.getBuffer(tlen + 4);
160
161    // find start of literal size. May be tlen if there is no literal.
162    // While finding start of literal size, compute literal length
163    int32_t literalLength = 0;
164    int32_t tLiteralStart = tlen;
165    while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
166        tLiteralStart--;
167        literalLength <<= 8;
168        literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
169    }
170    // Add number of chars we just added to literal
171    literalLength += len;
172
173    // Now encode the new length starting at tLiteralStart
174    tlen = tLiteralStart;
175    tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
176    literalLength >>= 8;
177    while (literalLength) {
178        tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
179        literalLength >>= 8;
180    }
181    tokens.releaseBuffer(tlen);
182}
183
// Appends a single token of type t with a count of 1 by forwarding to
// add(t, 1).
void
AffixPattern::add(ETokenType t) {
    add(t, 1);
}
188
// Appends a kCurrency token whose length is `count` by forwarding to
// add(kCurrency, count).
void
AffixPattern::addCurrency(uint8_t count) {
    add(kCurrency, count);
}
193
194void
195AffixPattern::add(ETokenType t, uint8_t count) {
196    U_ASSERT(t != kLiteral);
197    char32Count += count;
198    switch (t) {
199    case kCurrency:
200        hasCurrencyToken = TRUE;
201        break;
202    case kPercent:
203        hasPercentToken = TRUE;
204        break;
205    case kPerMill:
206        hasPermillToken = TRUE;
207        break;
208    default:
209        // Do nothing
210        break;
211    }
212    tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
213}
214
215AffixPattern &
216AffixPattern::append(const AffixPattern &other) {
217    AffixPatternIterator iter;
218    other.iterator(iter);
219    UnicodeString literal;
220    while (iter.nextToken()) {
221        switch (iter.getTokenType()) {
222        case kLiteral:
223            iter.getLiteral(literal);
224            addLiteral(literal.getBuffer(), 0, literal.length());
225            break;
226        case kCurrency:
227            addCurrency(iter.getTokenLength());
228            break;
229        default:
230            add(iter.getTokenType());
231            break;
232        }
233    }
234    return *this;
235}
236
237void
238AffixPattern::remove() {
239    tokens.remove();
240    literals.remove();
241    hasCurrencyToken = FALSE;
242    hasPercentToken = FALSE;
243    hasPermillToken = FALSE;
244    char32Count = 0;
245}
246
247// escapes literals for strings where special characters are NOT escaped
248// except for apostrophe.
249static void escapeApostropheInLiteral(
250        const UnicodeString &literal, UnicodeStringAppender &appender) {
251    int32_t len = literal.length();
252    const UChar *buffer = literal.getBuffer();
253    for (int32_t i = 0; i < len; ++i) {
254        UChar ch = buffer[i];
255        switch (ch) {
256            case 0x27:
257                appender.append((UChar) 0x27);
258                appender.append((UChar) 0x27);
259                break;
260            default:
261                appender.append(ch);
262                break;
263        }
264    }
265}
266
267
268// escapes literals for user strings where special characters in literals
269// are escaped with apostrophe.
270static void escapeLiteral(
271        const UnicodeString &literal, UnicodeStringAppender &appender) {
272    int32_t len = literal.length();
273    const UChar *buffer = literal.getBuffer();
274    for (int32_t i = 0; i < len; ++i) {
275        UChar ch = buffer[i];
276        switch (ch) {
277            case 0x27:
278                appender.append((UChar) 0x27);
279                appender.append((UChar) 0x27);
280                break;
281            case 0x25:
282                appender.append((UChar) 0x27);
283                appender.append((UChar) 0x25);
284                appender.append((UChar) 0x27);
285                break;
286            case 0x2030:
287                appender.append((UChar) 0x27);
288                appender.append((UChar) 0x2030);
289                appender.append((UChar) 0x27);
290                break;
291            case 0xA4:
292                appender.append((UChar) 0x27);
293                appender.append((UChar) 0xA4);
294                appender.append((UChar) 0x27);
295                break;
296            case 0x2D:
297                appender.append((UChar) 0x27);
298                appender.append((UChar) 0x2D);
299                appender.append((UChar) 0x27);
300                break;
301            case 0x2B:
302                appender.append((UChar) 0x27);
303                appender.append((UChar) 0x2B);
304                appender.append((UChar) 0x27);
305                break;
306            default:
307                appender.append(ch);
308                break;
309        }
310    }
311}
312
313UnicodeString &
314AffixPattern::toString(UnicodeString &appendTo) const {
315    AffixPatternIterator iter;
316    iterator(iter);
317    UnicodeStringAppender appender(appendTo);
318    UnicodeString literal;
319    while (iter.nextToken()) {
320        switch (iter.getTokenType()) {
321        case kLiteral:
322            escapeApostropheInLiteral(iter.getLiteral(literal), appender);
323            break;
324        case kPercent:
325            appender.append((UChar) 0x27);
326            appender.append((UChar) 0x25);
327            break;
328        case kPerMill:
329            appender.append((UChar) 0x27);
330            appender.append((UChar) 0x2030);
331            break;
332        case kCurrency:
333            {
334                appender.append((UChar) 0x27);
335                int32_t cl = iter.getTokenLength();
336                for (int32_t i = 0; i < cl; ++i) {
337                    appender.append((UChar) 0xA4);
338                }
339            }
340            break;
341        case kNegative:
342            appender.append((UChar) 0x27);
343            appender.append((UChar) 0x2D);
344            break;
345        case kPositive:
346            appender.append((UChar) 0x27);
347            appender.append((UChar) 0x2B);
348            break;
349        default:
350            U_ASSERT(FALSE);
351            break;
352        }
353    }
354    return appendTo;
355}
356
357UnicodeString &
358AffixPattern::toUserString(UnicodeString &appendTo) const {
359    AffixPatternIterator iter;
360    iterator(iter);
361    UnicodeStringAppender appender(appendTo);
362    UnicodeString literal;
363    while (iter.nextToken()) {
364        switch (iter.getTokenType()) {
365        case kLiteral:
366            escapeLiteral(iter.getLiteral(literal), appender);
367            break;
368        case kPercent:
369            appender.append((UChar) 0x25);
370            break;
371        case kPerMill:
372            appender.append((UChar) 0x2030);
373            break;
374        case kCurrency:
375            {
376                int32_t cl = iter.getTokenLength();
377                for (int32_t i = 0; i < cl; ++i) {
378                    appender.append((UChar) 0xA4);
379                }
380            }
381            break;
382        case kNegative:
383            appender.append((UChar) 0x2D);
384            break;
385        case kPositive:
386            appender.append((UChar) 0x2B);
387            break;
388        default:
389            U_ASSERT(FALSE);
390            break;
391        }
392    }
393    return appendTo;
394}
395
396class AffixPatternAppender : public UMemory {
397public:
398    AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
399
400    inline void append(UChar x) {
401        if (fIdx == UPRV_LENGTHOF(fBuffer)) {
402            fDest->addLiteral(fBuffer, 0, fIdx);
403            fIdx = 0;
404        }
405        fBuffer[fIdx++] = x;
406    }
407
408    inline void append(UChar32 x) {
409        if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
410            fDest->addLiteral(fBuffer, 0, fIdx);
411            fIdx = 0;
412        }
413        U16_APPEND_UNSAFE(fBuffer, fIdx, x);
414    }
415
416    inline void flush() {
417        if (fIdx) {
418            fDest->addLiteral(fBuffer, 0, fIdx);
419        }
420        fIdx = 0;
421    }
422
423    /**
424     * flush the buffer when we go out of scope.
425     */
426    ~AffixPatternAppender() {
427        flush();
428    }
429private:
430    AffixPattern *fDest;
431    int32_t fIdx;
432    UChar fBuffer[32];
433    AffixPatternAppender(const AffixPatternAppender &other);
434    AffixPatternAppender &operator=(const AffixPatternAppender &other);
435};
436
437
438AffixPattern &
439AffixPattern::parseUserAffixString(
440        const UnicodeString &affixStr,
441        AffixPattern &appendTo,
442        UErrorCode &status) {
443    if (U_FAILURE(status)) {
444        return appendTo;
445    }
446    int32_t len = affixStr.length();
447    const UChar *buffer = affixStr.getBuffer();
448    // 0 = not quoted; 1 = quoted.
449    int32_t state = 0;
450    AffixPatternAppender appender(appendTo);
451    for (int32_t i = 0; i < len; ) {
452        UChar token;
453        int32_t tokenSize = nextUserToken(buffer, i, len, &token);
454        i += tokenSize;
455        if (token == 0x27 && tokenSize == 1) { // quote
456            state = 1 - state;
457            continue;
458        }
459        if (state == 0) {
460            switch (token) {
461            case 0x25:
462                appender.flush();
463                appendTo.add(kPercent, 1);
464                break;
465            case 0x27:  // double quote
466                appender.append((UChar) 0x27);
467                break;
468            case 0x2030:
469                appender.flush();
470                appendTo.add(kPerMill, 1);
471                break;
472            case 0x2D:
473                appender.flush();
474                appendTo.add(kNegative, 1);
475                break;
476            case 0x2B:
477                appender.flush();
478                appendTo.add(kPositive, 1);
479                break;
480            case 0xA4:
481                appender.flush();
482                appendTo.add(kCurrency, tokenSize);
483                break;
484            default:
485                appender.append(token);
486                break;
487            }
488        } else {
489            switch (token) {
490            case 0x27:  // double quote
491                appender.append((UChar) 0x27);
492                break;
493            case 0xA4: // included b/c tokenSize can be > 1
494                for (int32_t j = 0; j < tokenSize; ++j) {
495                    appender.append((UChar) 0xA4);
496                }
497                break;
498            default:
499                appender.append(token);
500                break;
501            }
502        }
503    }
504    return appendTo;
505}
506
// Parses an internally-formatted affix string, in which a token is marked
// as special by a preceding apostrophe (see nextToken()), and appends the
// resulting tokens to appendTo, which is also returned. Unescaped
// characters become literal text. Does nothing if status is already a
// failure. Sets status to U_PARSE_ERROR if an escaped currency run is
// longer than 3 currency signs.
AffixPattern &
AffixPattern::parseAffixString(
        const UnicodeString &affixStr,
        AffixPattern &appendTo,
        UErrorCode &status) {
    if (U_FAILURE(status)) {
        return appendTo;
    }
    int32_t len = affixStr.length();
    const UChar *buffer = affixStr.getBuffer();
    for (int32_t i = 0; i < len; ) {
        UChar token;
        int32_t tokenSize = nextToken(buffer, i, len, &token);
        if (tokenSize == 1) {
            // A size-1 token is an unescaped character. Gather the whole
            // run of them and add it as a single literal.
            int32_t literalStart = i;
            ++i;
            while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
                ++i;
            }
            appendTo.addLiteral(buffer, literalStart, i - literalStart);

            // If we reached end of string, we are done
            if (i == len) {
                return appendTo;
            }
            // Otherwise token / tokenSize now describe the escaped token
            // that ended the literal run; it is handled below.
        }
        // Escaped token: tokenSize includes the leading apostrophe.
        i += tokenSize;
        switch (token) {
        case 0x25:
            appendTo.add(kPercent, 1);
            break;
        case 0x2030:
            appendTo.add(kPerMill, 1);
            break;
        case 0x2D:
            appendTo.add(kNegative, 1);
            break;
        case 0x2B:
            appendTo.add(kPositive, 1);
            break;
        case 0xA4:
            {
                // tokenSize - 1 is the number of currency signs.
                if (tokenSize - 1 > 3) {
                    status = U_PARSE_ERROR;
                    return appendTo;
                }
                appendTo.add(kCurrency, tokenSize - 1);
            }
            break;
        default:
            // Escaped character that is not special (for example a
            // doubled apostrophe): keep the character as literal text.
            appendTo.addLiteral(&token, 0, 1);
            break;
        }
    }
    return appendTo;
}
563
564AffixPatternIterator &
565AffixPattern::iterator(AffixPatternIterator &result) const {
566    result.nextLiteralIndex = 0;
567    result.lastLiteralLength = 0;
568    result.nextTokenIndex = 0;
569    result.tokens = &tokens;
570    result.literals = &literals;
571    return result;
572}
573
574UBool
575AffixPatternIterator::nextToken() {
576    int32_t tlen = tokens->length();
577    if (nextTokenIndex == tlen) {
578        return FALSE;
579    }
580    ++nextTokenIndex;
581    const UChar *tokenBuffer = tokens->getBuffer();
582    if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
583            AffixPattern::kLiteral) {
584        while (nextTokenIndex < tlen &&
585                UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
586            ++nextTokenIndex;
587        }
588        lastLiteralLength = 0;
589        int32_t i = nextTokenIndex - 1;
590        for (; UNPACK_LONG(tokenBuffer[i]); --i) {
591            lastLiteralLength <<= 8;
592            lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
593        }
594        lastLiteralLength <<= 8;
595        lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
596        nextLiteralIndex += lastLiteralLength;
597    }
598    return TRUE;
599}
600
// Returns the type decoded from the token UChar at index
// nextTokenIndex - 1, i.e. the UChar just before the position that
// nextToken() will examine next.
AffixPattern::ETokenType
AffixPatternIterator::getTokenType() const {
    return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
}
605
606UnicodeString &
607AffixPatternIterator::getLiteral(UnicodeString &result) const {
608    const UChar *buffer = literals->getBuffer();
609    result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
610    return result;
611}
612
613int32_t
614AffixPatternIterator::getTokenLength() const {
615    const UChar *tokenBuffer = tokens->getBuffer();
616    AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
617    return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
618}
619
// Constructs a parser whose percent, per-mille, negative and positive
// strings are initialized from the file-level defaults gPercent, gPerMill,
// gNegative and gPositive.
AffixPatternParser::AffixPatternParser()
        : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
}
623
// Constructs a parser whose affix strings are taken from `symbols` by
// calling setDecimalFormatSymbols().
AffixPatternParser::AffixPatternParser(
        const DecimalFormatSymbols &symbols) {
    setDecimalFormatSymbols(symbols);
}
628
629void
630AffixPatternParser::setDecimalFormatSymbols(
631        const DecimalFormatSymbols &symbols) {
632    fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
633    fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
634    fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
635    fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
636}
637
638PluralAffix &
639AffixPatternParser::parse(
640        const AffixPattern &affixPattern,
641        const CurrencyAffixInfo &currencyAffixInfo,
642        PluralAffix &appendTo,
643        UErrorCode &status) const {
644    if (U_FAILURE(status)) {
645        return appendTo;
646    }
647    AffixPatternIterator iter;
648    affixPattern.iterator(iter);
649    UnicodeString literal;
650    while (iter.nextToken()) {
651        switch (iter.getTokenType()) {
652        case AffixPattern::kPercent:
653            appendTo.append(fPercent, UNUM_PERCENT_FIELD);
654            break;
655        case AffixPattern::kPerMill:
656            appendTo.append(fPermill, UNUM_PERMILL_FIELD);
657            break;
658        case AffixPattern::kNegative:
659            appendTo.append(fNegative, UNUM_SIGN_FIELD);
660            break;
661        case AffixPattern::kPositive:
662            appendTo.append(fPositive, UNUM_SIGN_FIELD);
663            break;
664        case AffixPattern::kCurrency:
665            switch (iter.getTokenLength()) {
666                case 1:
667                    appendTo.append(
668                            currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
669                    break;
670                case 2:
671                    appendTo.append(
672                            currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
673                    break;
674                case 3:
675                    appendTo.append(
676                            currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
677                    break;
678                default:
679                    U_ASSERT(FALSE);
680                    break;
681            }
682            break;
683        case AffixPattern::kLiteral:
684            appendTo.append(iter.getLiteral(literal));
685            break;
686        default:
687            U_ASSERT(FALSE);
688            break;
689        }
690    }
691    return appendTo;
692}
693
694
695U_NAMESPACE_END
696#endif /* #if !UCONFIG_NO_FORMATTING */
697