1/*
2*******************************************************************************
3*   Copyright (C) 2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  messagepattern.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2011mar14
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_FORMATTING
18
19#include "unicode/messagepattern.h"
20#include "unicode/unistr.h"
21#include "cmemory.h"
22#include "cstring.h"
23#include "messageimpl.h"
24#include "patternprops.h"
25#include "putilimp.h"
26#include "uassert.h"
27
28U_NAMESPACE_BEGIN
29
30// Unicode character/code point constants ---------------------------------- ***
31
32static const UChar u_pound=0x23;
33static const UChar u_apos=0x27;
34static const UChar u_plus=0x2B;
35static const UChar u_comma=0x2C;
36static const UChar u_minus=0x2D;
37static const UChar u_dot=0x2E;
38static const UChar u_colon=0x3A;
39static const UChar u_lessThan=0x3C;
40static const UChar u_equal=0x3D;
41static const UChar u_A=0x41;
42static const UChar u_C=0x43;
43static const UChar u_E=0x45;
44static const UChar u_H=0x48;
45static const UChar u_I=0x49;
46static const UChar u_L=0x4C;
47static const UChar u_O=0x4F;
48static const UChar u_P=0x50;
49static const UChar u_R=0x52;
50static const UChar u_S=0x53;
51static const UChar u_T=0x54;
52static const UChar u_U=0x55;
53static const UChar u_Z=0x5A;
54static const UChar u_a=0x61;
55static const UChar u_c=0x63;
56static const UChar u_e=0x65;
57static const UChar u_f=0x66;
58static const UChar u_h=0x68;
59static const UChar u_i=0x69;
60static const UChar u_l=0x6C;
61static const UChar u_o=0x6F;
62static const UChar u_p=0x70;
63static const UChar u_r=0x72;
64static const UChar u_s=0x73;
65static const UChar u_t=0x74;
66static const UChar u_u=0x75;
67static const UChar u_z=0x7A;
68static const UChar u_leftCurlyBrace=0x7B;
69static const UChar u_pipe=0x7C;
70static const UChar u_rightCurlyBrace=0x7D;
71static const UChar u_lessOrEqual=0x2264;  // U+2264 is <=
72
73static const UChar kOffsetColon[]={  // "offset:"
74    u_o, u_f, u_f, u_s, u_e, u_t, u_colon
75};
76
77static const UChar kOther[]={  // "other"
78    u_o, u_t, u_h, u_e, u_r
79};
80
81// MessagePatternList ------------------------------------------------------ ***
82
83template<typename T, int32_t stackCapacity>
84class MessagePatternList : public UMemory {
85public:
86    MessagePatternList() {}
87    void copyFrom(const MessagePatternList<T, stackCapacity> &other,
88                  int32_t length,
89                  UErrorCode &errorCode);
90    UBool ensureCapacityForOneMore(int32_t oldLength, UErrorCode &errorCode);
91    UBool memEquals(const MessagePatternList<T, stackCapacity> &other, int32_t length) const {
92        return 0==uprv_memcmp(a.getAlias(), other.a.getAlias(), length*sizeof(T));
93    }
94
95    MaybeStackArray<T, stackCapacity> a;
96};
97
98template<typename T, int32_t stackCapacity>
99void
100MessagePatternList<T, stackCapacity>::copyFrom(
101        const MessagePatternList<T, stackCapacity> &other,
102        int32_t length,
103        UErrorCode &errorCode) {
104    if(U_SUCCESS(errorCode) && length>0) {
105        if(length>a.getCapacity() && NULL==a.resize(length)) {
106            errorCode=U_MEMORY_ALLOCATION_ERROR;
107            return;
108        }
109        uprv_memcpy(a.getAlias(), other.a.getAlias(), length*sizeof(T));
110    }
111}
112
113template<typename T, int32_t stackCapacity>
114UBool
115MessagePatternList<T, stackCapacity>::ensureCapacityForOneMore(int32_t oldLength, UErrorCode &errorCode) {
116    if(U_FAILURE(errorCode)) {
117        return FALSE;
118    }
119    if(a.getCapacity()>oldLength || a.resize(2*oldLength, oldLength)!=NULL) {
120        return TRUE;
121    }
122    errorCode=U_MEMORY_ALLOCATION_ERROR;
123    return FALSE;
124}
125
126// MessagePatternList specializations -------------------------------------- ***
127
128class MessagePatternDoubleList : public MessagePatternList<double, 8> {
129};
130
131class MessagePatternPartsList : public MessagePatternList<MessagePattern::Part, 32> {
132};
133
134// MessagePattern constructors etc. ---------------------------------------- ***
135
136MessagePattern::MessagePattern(UErrorCode &errorCode)
137        : aposMode(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE),
138          partsList(NULL), parts(NULL), partsLength(0),
139          numericValuesList(NULL), numericValues(NULL), numericValuesLength(0),
140          hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) {
141    init(errorCode);
142}
143
144MessagePattern::MessagePattern(UMessagePatternApostropheMode mode, UErrorCode &errorCode)
145        : aposMode(mode),
146          partsList(NULL), parts(NULL), partsLength(0),
147          numericValuesList(NULL), numericValues(NULL), numericValuesLength(0),
148          hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) {
149    init(errorCode);
150}
151
152MessagePattern::MessagePattern(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode)
153        : aposMode(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE),
154          partsList(NULL), parts(NULL), partsLength(0),
155          numericValuesList(NULL), numericValues(NULL), numericValuesLength(0),
156          hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) {
157    if(init(errorCode)) {
158        parse(pattern, parseError, errorCode);
159    }
160}
161
162UBool
163MessagePattern::init(UErrorCode &errorCode) {
164    if(U_FAILURE(errorCode)) {
165        return FALSE;
166    }
167    partsList=new MessagePatternPartsList();
168    if(partsList==NULL) {
169        errorCode=U_MEMORY_ALLOCATION_ERROR;
170        return FALSE;
171    }
172    parts=partsList->a.getAlias();
173    return TRUE;
174}
175
176MessagePattern::MessagePattern(const MessagePattern &other)
177        : aposMode(other.aposMode), msg(other.msg),
178          partsList(NULL), parts(NULL), partsLength(0),
179          numericValuesList(NULL), numericValues(NULL), numericValuesLength(0),
180          hasArgNames(other.hasArgNames), hasArgNumbers(other.hasArgNumbers),
181          needsAutoQuoting(other.needsAutoQuoting) {
182    UErrorCode errorCode=U_ZERO_ERROR;
183    if(!copyStorage(other, errorCode)) {
184        clear();
185    }
186}
187
188MessagePattern &
189MessagePattern::operator=(const MessagePattern &other) {
190    if(this==&other) {
191        return *this;
192    }
193    aposMode=other.aposMode;
194    msg=other.msg;
195    hasArgNames=other.hasArgNames;
196    hasArgNumbers=other.hasArgNumbers;
197    needsAutoQuoting=other.needsAutoQuoting;
198    UErrorCode errorCode=U_ZERO_ERROR;
199    if(!copyStorage(other, errorCode)) {
200        clear();
201    }
202    return *this;
203}
204
205UBool
206MessagePattern::copyStorage(const MessagePattern &other, UErrorCode &errorCode) {
207    if(U_FAILURE(errorCode)) {
208        return FALSE;
209    }
210    parts=NULL;
211    partsLength=0;
212    numericValues=NULL;
213    numericValuesLength=0;
214    if(partsList==NULL) {
215        partsList=new MessagePatternPartsList();
216        if(partsList==NULL) {
217            errorCode=U_MEMORY_ALLOCATION_ERROR;
218            return FALSE;
219        }
220        parts=partsList->a.getAlias();
221    }
222    if(other.partsLength>0) {
223        partsList->copyFrom(*other.partsList, other.partsLength, errorCode);
224        if(U_FAILURE(errorCode)) {
225            return FALSE;
226        }
227        parts=partsList->a.getAlias();
228        partsLength=other.partsLength;
229    }
230    if(other.numericValuesLength>0) {
231        if(numericValuesList==NULL) {
232            numericValuesList=new MessagePatternDoubleList();
233            if(numericValuesList==NULL) {
234                errorCode=U_MEMORY_ALLOCATION_ERROR;
235                return FALSE;
236            }
237            numericValues=numericValuesList->a.getAlias();
238        }
239        numericValuesList->copyFrom(
240            *other.numericValuesList, other.numericValuesLength, errorCode);
241        if(U_FAILURE(errorCode)) {
242            return FALSE;
243        }
244        numericValues=numericValuesList->a.getAlias();
245        numericValuesLength=other.numericValuesLength;
246    }
247    return TRUE;
248}
249
250MessagePattern::~MessagePattern() {
251    delete partsList;
252    delete numericValuesList;
253}
254
255// MessagePattern API ------------------------------------------------------ ***
256
257MessagePattern &
258MessagePattern::parse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) {
259    preParse(pattern, parseError, errorCode);
260    parseMessage(0, 0, 0, UMSGPAT_ARG_TYPE_NONE, parseError, errorCode);
261    postParse();
262    return *this;
263}
264
265MessagePattern &
266MessagePattern::parseChoiceStyle(const UnicodeString &pattern,
267                                 UParseError *parseError, UErrorCode &errorCode) {
268    preParse(pattern, parseError, errorCode);
269    parseChoiceStyle(0, 0, parseError, errorCode);
270    postParse();
271    return *this;
272}
273
274MessagePattern &
275MessagePattern::parsePluralStyle(const UnicodeString &pattern,
276                                 UParseError *parseError, UErrorCode &errorCode) {
277    preParse(pattern, parseError, errorCode);
278    parsePluralOrSelectStyle(UMSGPAT_ARG_TYPE_PLURAL, 0, 0, parseError, errorCode);
279    postParse();
280    return *this;
281}
282
283MessagePattern &
284MessagePattern::parseSelectStyle(const UnicodeString &pattern,
285                                 UParseError *parseError, UErrorCode &errorCode) {
286    preParse(pattern, parseError, errorCode);
287    parsePluralOrSelectStyle(UMSGPAT_ARG_TYPE_SELECT, 0, 0, parseError, errorCode);
288    postParse();
289    return *this;
290}
291
292void
293MessagePattern::clear() {
294    // Mostly the same as preParse().
295    msg.remove();
296    hasArgNames=hasArgNumbers=FALSE;
297    needsAutoQuoting=FALSE;
298    partsLength=0;
299    numericValuesLength=0;
300}
301
302UBool
303MessagePattern::operator==(const MessagePattern &other) const {
304    if(this==&other) {
305        return TRUE;
306    }
307    return
308        aposMode==other.aposMode &&
309        msg==other.msg &&
310        // parts.equals(o.parts)
311        partsLength==other.partsLength &&
312        (partsLength==0 || partsList->memEquals(*other.partsList, partsLength));
313    // No need to compare numericValues if msg and parts are the same.
314}
315
316int32_t
317MessagePattern::hashCode() const {
318    int32_t hash=(aposMode*37+msg.hashCode())*37+partsLength;
319    for(int32_t i=0; i<partsLength; ++i) {
320        hash=hash*37+parts[i].hashCode();
321    }
322    return hash;
323}
324
325int32_t
326MessagePattern::validateArgumentName(const UnicodeString &name) {
327    if(!PatternProps::isIdentifier(name.getBuffer(), name.length())) {
328        return UMSGPAT_ARG_NAME_NOT_VALID;
329    }
330    return parseArgNumber(name, 0, name.length());
331}
332
333UnicodeString
334MessagePattern::autoQuoteApostropheDeep() const {
335    if(!needsAutoQuoting) {
336        return msg;
337    }
338    UnicodeString modified(msg);
339    // Iterate backward so that the insertion indexes do not change.
340    int32_t count=countParts();
341    for(int32_t i=count; i>0;) {
342        const Part &part=getPart(--i);
343        if(part.getType()==UMSGPAT_PART_TYPE_INSERT_CHAR) {
344           modified.insert(part.index, (UChar)part.value);
345        }
346    }
347    return modified;
348}
349
350double
351MessagePattern::getNumericValue(const Part &part) const {
352    UMessagePatternPartType type=part.type;
353    if(type==UMSGPAT_PART_TYPE_ARG_INT) {
354        return part.value;
355    } else if(type==UMSGPAT_PART_TYPE_ARG_DOUBLE) {
356        return numericValues[part.value];
357    } else {
358        return UMSGPAT_NO_NUMERIC_VALUE;
359    }
360}
361
362/**
363  * Returns the "offset:" value of a PluralFormat argument, or 0 if none is specified.
364  * @param pluralStart the index of the first PluralFormat argument style part. (0..countParts()-1)
365  * @return the "offset:" value.
366  * @draft ICU 4.8
367  */
368double
369MessagePattern::getPluralOffset(int32_t pluralStart) const {
370    const Part &part=getPart(pluralStart);
371    if(Part::hasNumericValue(part.type)) {
372        return getNumericValue(part);
373    } else {
374        return 0;
375    }
376}
377
378// MessagePattern::Part ---------------------------------------------------- ***
379
380UBool
381MessagePattern::Part::operator==(const Part &other) const {
382    if(this==&other) {
383        return TRUE;
384    }
385    return
386        type==other.type &&
387        index==other.index &&
388        length==other.length &&
389        value==other.value &&
390        limitPartIndex==other.limitPartIndex;
391}
392
393// MessagePattern parser --------------------------------------------------- ***
394
395void
396MessagePattern::preParse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) {
397    if(U_FAILURE(errorCode)) {
398        return;
399    }
400    if(parseError!=NULL) {
401        parseError->line=0;
402        parseError->offset=0;
403        parseError->preContext[0]=0;
404        parseError->postContext[0]=0;
405    }
406    msg=pattern;
407    hasArgNames=hasArgNumbers=FALSE;
408    needsAutoQuoting=FALSE;
409    partsLength=0;
410    numericValuesLength=0;
411}
412
413void
414MessagePattern::postParse() {
415    if(partsList!=NULL) {
416        parts=partsList->a.getAlias();
417    }
418    if(numericValuesList!=NULL) {
419        numericValues=numericValuesList->a.getAlias();
420    }
421}
422
423int32_t
424MessagePattern::parseMessage(int32_t index, int32_t msgStartLength,
425                             int32_t nestingLevel, UMessagePatternArgType parentType,
426                             UParseError *parseError, UErrorCode &errorCode) {
427    if(U_FAILURE(errorCode)) {
428        return 0;
429    }
430    if(nestingLevel>Part::MAX_VALUE) {
431        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
432        return 0;
433    }
434    int32_t msgStart=partsLength;
435    addPart(UMSGPAT_PART_TYPE_MSG_START, index, msgStartLength, nestingLevel, errorCode);
436    index+=msgStartLength;
437    for(;;) {  // while(index<msg.length()) with U_FAILURE(errorCode) check
438        if(U_FAILURE(errorCode)) {
439            return 0;
440        }
441        if(index>=msg.length()) {
442            break;
443        }
444        UChar c=msg.charAt(index++);
445        if(c==u_apos) {
446            if(index==msg.length()) {
447                // The apostrophe is the last character in the pattern.
448                // Add a Part for auto-quoting.
449                addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0,
450                        u_apos, errorCode);  // value=char to be inserted
451                needsAutoQuoting=TRUE;
452            } else {
453                c=msg.charAt(index);
454                if(c==u_apos) {
455                    // double apostrophe, skip the second one
456                    addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index++, 1, 0, errorCode);
457                } else if(
458                    aposMode==UMSGPAT_APOS_DOUBLE_REQUIRED ||
459                    c==u_leftCurlyBrace || c==u_rightCurlyBrace ||
460                    (parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_pipe) ||
461                    (parentType==UMSGPAT_ARG_TYPE_PLURAL && c==u_pound)
462                ) {
463                    // skip the quote-starting apostrophe
464                    addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index-1, 1, 0, errorCode);
465                    // find the end of the quoted literal text
466                    for(;;) {
467                        index=msg.indexOf(u_apos, index+1);
468                        if(index>=0) {
469                            if(/*(index+1)<msg.length() &&*/ msg.charAt(index+1)==u_apos) {
470                                // double apostrophe inside quoted literal text
471                                // still encodes a single apostrophe, skip the second one
472                                addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, ++index, 1, 0, errorCode);
473                            } else {
474                                // skip the quote-ending apostrophe
475                                addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index++, 1, 0, errorCode);
476                                break;
477                            }
478                        } else {
479                            // The quoted text reaches to the end of the of the message.
480                            index=msg.length();
481                            // Add a Part for auto-quoting.
482                            addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0,
483                                    u_apos, errorCode);  // value=char to be inserted
484                            needsAutoQuoting=TRUE;
485                            break;
486                        }
487                    }
488                } else {
489                    // Interpret the apostrophe as literal text.
490                    // Add a Part for auto-quoting.
491                    addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0,
492                            u_apos, errorCode);  // value=char to be inserted
493                    needsAutoQuoting=TRUE;
494                }
495            }
496        } else if(parentType==UMSGPAT_ARG_TYPE_PLURAL && c==u_pound) {
497            // The unquoted # in a plural message fragment will be replaced
498            // with the (number-offset).
499            addPart(UMSGPAT_PART_TYPE_REPLACE_NUMBER, index-1, 1, 0, errorCode);
500        } else if(c==u_leftCurlyBrace) {
501            index=parseArg(index-1, 1, nestingLevel, parseError, errorCode);
502        } else if((nestingLevel>0 && c==u_rightCurlyBrace) ||
503                  (parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_pipe)) {
504            // Finish the message before the terminator.
505            // In a choice style, report the "}" substring only for the following ARG_LIMIT,
506            // not for this MSG_LIMIT.
507            int32_t limitLength=(parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_rightCurlyBrace) ? 0 : 1;
508            addLimitPart(msgStart, UMSGPAT_PART_TYPE_MSG_LIMIT, index-1, limitLength,
509                         nestingLevel, errorCode);
510            if(parentType==UMSGPAT_ARG_TYPE_CHOICE) {
511                // Let the choice style parser see the '}' or '|'.
512                return index-1;
513            } else {
514                // continue parsing after the '}'
515                return index;
516            }
517        }  // else: c is part of literal text
518    }
519    if(nestingLevel>0 && !inTopLevelChoiceMessage(nestingLevel, parentType)) {
520        setParseError(parseError, 0);  // Unmatched '{' braces in message.
521        errorCode=U_UNMATCHED_BRACES;
522        return 0;
523    }
524    addLimitPart(msgStart, UMSGPAT_PART_TYPE_MSG_LIMIT, index, 0, nestingLevel, errorCode);
525    return index;
526}
527
528int32_t
529MessagePattern::parseArg(int32_t index, int32_t argStartLength, int32_t nestingLevel,
530                         UParseError *parseError, UErrorCode &errorCode) {
531    int32_t argStart=partsLength;
532    UMessagePatternArgType argType=UMSGPAT_ARG_TYPE_NONE;
533    addPart(UMSGPAT_PART_TYPE_ARG_START, index, argStartLength, argType, errorCode);
534    if(U_FAILURE(errorCode)) {
535        return 0;
536    }
537    int32_t nameIndex=index=skipWhiteSpace(index+argStartLength);
538    if(index==msg.length()) {
539        setParseError(parseError, 0);  // Unmatched '{' braces in message.
540        errorCode=U_UNMATCHED_BRACES;
541        return 0;
542    }
543    // parse argument name or number
544    index=skipIdentifier(index);
545    int32_t number=parseArgNumber(nameIndex, index);
546    if(number>=0) {
547        int32_t length=index-nameIndex;
548        if(length>Part::MAX_LENGTH || number>Part::MAX_VALUE) {
549            setParseError(parseError, nameIndex);  // Argument number too large.
550            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
551            return 0;
552        }
553        hasArgNumbers=TRUE;
554        addPart(UMSGPAT_PART_TYPE_ARG_NUMBER, nameIndex, length, number, errorCode);
555    } else if(number==UMSGPAT_ARG_NAME_NOT_NUMBER) {
556        int32_t length=index-nameIndex;
557        if(length>Part::MAX_LENGTH) {
558            setParseError(parseError, nameIndex);  // Argument name too long.
559            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
560            return 0;
561        }
562        hasArgNames=TRUE;
563        addPart(UMSGPAT_PART_TYPE_ARG_NAME, nameIndex, length, 0, errorCode);
564    } else {  // number<-1 (ARG_NAME_NOT_VALID)
565        setParseError(parseError, nameIndex);  // Bad argument syntax.
566        errorCode=U_PATTERN_SYNTAX_ERROR;
567        return 0;
568    }
569    index=skipWhiteSpace(index);
570    if(index==msg.length()) {
571        setParseError(parseError, 0);  // Unmatched '{' braces in message.
572        errorCode=U_UNMATCHED_BRACES;
573        return 0;
574    }
575    UChar c=msg.charAt(index);
576    if(c==u_rightCurlyBrace) {
577        // all done
578    } else if(c!=u_comma) {
579        setParseError(parseError, nameIndex);  // Bad argument syntax.
580        errorCode=U_PATTERN_SYNTAX_ERROR;
581        return 0;
582    } else /* ',' */ {
583        // parse argument type: case-sensitive a-zA-Z
584        int32_t typeIndex=index=skipWhiteSpace(index+1);
585        while(index<msg.length() && isArgTypeChar(msg.charAt(index))) {
586            ++index;
587        }
588        int32_t length=index-typeIndex;
589        index=skipWhiteSpace(index);
590        if(index==msg.length()) {
591            setParseError(parseError, 0);  // Unmatched '{' braces in message.
592            errorCode=U_UNMATCHED_BRACES;
593            return 0;
594        }
595        if(length==0 || ((c=msg.charAt(index))!=u_comma && c!=u_rightCurlyBrace)) {
596            setParseError(parseError, nameIndex);  // Bad argument syntax.
597            errorCode=U_PATTERN_SYNTAX_ERROR;
598            return 0;
599        }
600        if(length>Part::MAX_LENGTH) {
601            setParseError(parseError, nameIndex);  // Argument type name too long.
602            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
603            return 0;
604        }
605        argType=UMSGPAT_ARG_TYPE_SIMPLE;
606        if(length==6) {
607            // case-insensitive comparisons for complex-type names
608            if(isChoice(typeIndex)) {
609                argType=UMSGPAT_ARG_TYPE_CHOICE;
610            } else if(isPlural(typeIndex)) {
611                argType=UMSGPAT_ARG_TYPE_PLURAL;
612            } else if(isSelect(typeIndex)) {
613                argType=UMSGPAT_ARG_TYPE_SELECT;
614            }
615        }
616        // change the ARG_START type from NONE to argType
617        partsList->a[argStart].value=(int16_t)argType;
618        if(argType==UMSGPAT_ARG_TYPE_SIMPLE) {
619            addPart(UMSGPAT_PART_TYPE_ARG_TYPE, typeIndex, length, 0, errorCode);
620        }
621        // look for an argument style (pattern)
622        if(c==u_rightCurlyBrace) {
623            if(argType!=UMSGPAT_ARG_TYPE_SIMPLE) {
624                setParseError(parseError, nameIndex);  // No style field for complex argument.
625                errorCode=U_PATTERN_SYNTAX_ERROR;
626                return 0;
627            }
628        } else /* ',' */ {
629            ++index;
630            if(argType==UMSGPAT_ARG_TYPE_SIMPLE) {
631                index=parseSimpleStyle(index, parseError, errorCode);
632            } else if(argType==UMSGPAT_ARG_TYPE_CHOICE) {
633                index=parseChoiceStyle(index, nestingLevel, parseError, errorCode);
634            } else {
635                index=parsePluralOrSelectStyle(argType, index, nestingLevel, parseError, errorCode);
636            }
637        }
638    }
639    // Argument parsing stopped on the '}'.
640    addLimitPart(argStart, UMSGPAT_PART_TYPE_ARG_LIMIT, index, 1, argType, errorCode);
641    return index+1;
642}
643
644int32_t
645MessagePattern::parseSimpleStyle(int32_t index, UParseError *parseError, UErrorCode &errorCode) {
646    if(U_FAILURE(errorCode)) {
647        return 0;
648    }
649    int32_t start=index;
650    int32_t nestedBraces=0;
651    while(index<msg.length()) {
652        UChar c=msg.charAt(index++);
653        if(c==u_apos) {
654            // Treat apostrophe as quoting but include it in the style part.
655            // Find the end of the quoted literal text.
656            index=msg.indexOf(u_apos, index);
657            if(index<0) {
658                // Quoted literal argument style text reaches to the end of the message.
659                setParseError(parseError, start);
660                errorCode=U_PATTERN_SYNTAX_ERROR;
661                return 0;
662            }
663            // skip the quote-ending apostrophe
664            ++index;
665        } else if(c==u_leftCurlyBrace) {
666            ++nestedBraces;
667        } else if(c==u_rightCurlyBrace) {
668            if(nestedBraces>0) {
669                --nestedBraces;
670            } else {
671                int32_t length=--index-start;
672                if(length>Part::MAX_LENGTH) {
673                    setParseError(parseError, start);  // Argument style text too long.
674                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
675                    return 0;
676                }
677                addPart(UMSGPAT_PART_TYPE_ARG_STYLE, start, length, 0, errorCode);
678                return index;
679            }
680        }  // c is part of literal text
681    }
682    setParseError(parseError, 0);  // Unmatched '{' braces in message.
683    errorCode=U_UNMATCHED_BRACES;
684    return 0;
685}
686
687int32_t
688MessagePattern::parseChoiceStyle(int32_t index, int32_t nestingLevel,
689                                 UParseError *parseError, UErrorCode &errorCode) {
690    if(U_FAILURE(errorCode)) {
691        return 0;
692    }
693    int32_t start=index;
694    index=skipWhiteSpace(index);
695    if(index==msg.length() || msg.charAt(index)==u_rightCurlyBrace) {
696        setParseError(parseError, 0);  // Missing choice argument pattern.
697        errorCode=U_PATTERN_SYNTAX_ERROR;
698        return 0;
699    }
700    for(;;) {
701        // The choice argument style contains |-separated (number, separator, message) triples.
702        // Parse the number.
703        int32_t numberIndex=index;
704        index=skipDouble(index);
705        int32_t length=index-numberIndex;
706        if(length==0) {
707            setParseError(parseError, start);  // Bad choice pattern syntax.
708            errorCode=U_PATTERN_SYNTAX_ERROR;
709            return 0;
710        }
711        if(length>Part::MAX_LENGTH) {
712            setParseError(parseError, numberIndex);  // Choice number too long.
713            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
714            return 0;
715        }
716        parseDouble(numberIndex, index, TRUE, parseError, errorCode);  // adds ARG_INT or ARG_DOUBLE
717        if(U_FAILURE(errorCode)) {
718            return 0;
719        }
720        // Parse the separator.
721        index=skipWhiteSpace(index);
722        if(index==msg.length()) {
723            setParseError(parseError, start);  // Bad choice pattern syntax.
724            errorCode=U_PATTERN_SYNTAX_ERROR;
725            return 0;
726        }
727        UChar c=msg.charAt(index);
728        if(!(c==u_pound || c==u_lessThan || c==u_lessOrEqual)) {  // U+2264 is <=
729            setParseError(parseError, start);  // Expected choice separator (#<\u2264) instead of c.
730            errorCode=U_PATTERN_SYNTAX_ERROR;
731            return 0;
732        }
733        addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, index, 1, 0, errorCode);
734        // Parse the message fragment.
735        index=parseMessage(++index, 0, nestingLevel+1, UMSGPAT_ARG_TYPE_CHOICE, parseError, errorCode);
736        if(U_FAILURE(errorCode)) {
737            return 0;
738        }
739        // parseMessage(..., CHOICE) returns the index of the terminator, or msg.length().
740        if(index==msg.length()) {
741            return index;
742        }
743        if(msg.charAt(index)==u_rightCurlyBrace) {
744            if(!inMessageFormatPattern(nestingLevel)) {
745                setParseError(parseError, start);  // Bad choice pattern syntax.
746                errorCode=U_PATTERN_SYNTAX_ERROR;
747                return 0;
748            }
749            return index;
750        }  // else the terminator is '|'
751        index=skipWhiteSpace(index+1);
752    }
753}
754
755int32_t
756MessagePattern::parsePluralOrSelectStyle(UMessagePatternArgType argType,
757                                         int32_t index, int32_t nestingLevel,
758                                         UParseError *parseError, UErrorCode &errorCode) {
759    if(U_FAILURE(errorCode)) {
760        return 0;
761    }
762    int32_t start=index;
763    UBool isEmpty=TRUE;
764    UBool hasOther=FALSE;
765    for(;;) {
766        // First, collect the selector looking for a small set of terminators.
767        // It would be a little faster to consider the syntax of each possible
768        // token right here, but that makes the code too complicated.
769        index=skipWhiteSpace(index);
770        UBool eos=index==msg.length();
771        if(eos || msg.charAt(index)==u_rightCurlyBrace) {
772            if(eos==inMessageFormatPattern(nestingLevel)) {
773                setParseError(parseError, start);  // Bad plural/select pattern syntax.
774                errorCode=U_PATTERN_SYNTAX_ERROR;
775                return 0;
776            }
777            if(!hasOther) {
778                setParseError(parseError, 0);  // Missing 'other' keyword in plural/select pattern.
779                errorCode=U_DEFAULT_KEYWORD_MISSING;
780                return 0;
781            }
782            return index;
783        }
784        int32_t selectorIndex=index;
785        if(argType==UMSGPAT_ARG_TYPE_PLURAL && msg.charAt(selectorIndex)==u_equal) {
786            // explicit-value plural selector: =double
787            index=skipDouble(index+1);
788            int32_t length=index-selectorIndex;
789            if(length==1) {
790                setParseError(parseError, start);  // Bad plural/select pattern syntax.
791                errorCode=U_PATTERN_SYNTAX_ERROR;
792                return 0;
793            }
794            if(length>Part::MAX_LENGTH) {
795                setParseError(parseError, selectorIndex);  // Argument selector too long.
796                errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
797                return 0;
798            }
799            addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, selectorIndex, length, 0, errorCode);
800            parseDouble(selectorIndex+1, index, FALSE,
801                        parseError, errorCode);  // adds ARG_INT or ARG_DOUBLE
802        } else {
803            index=skipIdentifier(index);
804            int32_t length=index-selectorIndex;
805            if(length==0) {
806                setParseError(parseError, start);  // Bad plural/select pattern syntax.
807                errorCode=U_PATTERN_SYNTAX_ERROR;
808                return 0;
809            }
810            // Note: The ':' in "offset:" is just beyond the skipIdentifier() range.
811            if( argType==UMSGPAT_ARG_TYPE_PLURAL && length==6 && index<msg.length() &&
812                0==msg.compare(selectorIndex, 7, kOffsetColon, 0, 7)
813            ) {
814                // plural offset, not a selector
815                if(!isEmpty) {
816                    // Plural argument 'offset:' (if present) must precede key-message pairs.
817                    setParseError(parseError, start);
818                    errorCode=U_PATTERN_SYNTAX_ERROR;
819                    return 0;
820                }
821                // allow whitespace between offset: and its value
822                int32_t valueIndex=skipWhiteSpace(index+1);  // The ':' is at index.
823                index=skipDouble(valueIndex);
824                if(index==valueIndex) {
825                    setParseError(parseError, start);  // Missing value for plural 'offset:'.
826                    errorCode=U_PATTERN_SYNTAX_ERROR;
827                    return 0;
828                }
829                if((index-valueIndex)>Part::MAX_LENGTH) {
830                    setParseError(parseError, valueIndex);  // Plural offset value too long.
831                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
832                    return 0;
833                }
834                parseDouble(valueIndex, index, FALSE,
835                            parseError, errorCode);  // adds ARG_INT or ARG_DOUBLE
836                if(U_FAILURE(errorCode)) {
837                    return 0;
838                }
839                isEmpty=FALSE;
840                continue;  // no message fragment after the offset
841            } else {
842                // normal selector word
843                if(length>Part::MAX_LENGTH) {
844                    setParseError(parseError, selectorIndex);  // Argument selector too long.
845                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
846                    return 0;
847                }
848                addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, selectorIndex, length, 0, errorCode);
849                if(0==msg.compare(selectorIndex, length, kOther, 0, 5)) {
850                    hasOther=TRUE;
851                }
852            }
853        }
854        if(U_FAILURE(errorCode)) {
855            return 0;
856        }
857
858        // parse the message fragment following the selector
859        index=skipWhiteSpace(index);
860        if(index==msg.length() || msg.charAt(index)!=u_leftCurlyBrace) {
861            setParseError(parseError, selectorIndex);  // No message fragment after plural/select selector.
862            errorCode=U_PATTERN_SYNTAX_ERROR;
863            return 0;
864        }
865        index=parseMessage(index, 1, nestingLevel+1, argType, parseError, errorCode);
866        if(U_FAILURE(errorCode)) {
867            return 0;
868        }
869        isEmpty=FALSE;
870    }
871}
872
873int32_t
874MessagePattern::parseArgNumber(const UnicodeString &s, int32_t start, int32_t limit) {
875    // If the identifier contains only ASCII digits, then it is an argument _number_
876    // and must not have leading zeros (except "0" itself).
877    // Otherwise it is an argument _name_.
878    if(start>=limit) {
879        return UMSGPAT_ARG_NAME_NOT_VALID;
880    }
881    int32_t number;
882    // Defer numeric errors until we know there are only digits.
883    UBool badNumber;
884    UChar c=s.charAt(start++);
885    if(c==0x30) {
886        if(start==limit) {
887            return 0;
888        } else {
889            number=0;
890            badNumber=TRUE;  // leading zero
891        }
892    } else if(0x31<=c && c<=0x39) {
893        number=c-0x30;
894        badNumber=FALSE;
895    } else {
896        return UMSGPAT_ARG_NAME_NOT_NUMBER;
897    }
898    while(start<limit) {
899        c=s.charAt(start++);
900        if(0x30<=c && c<=0x39) {
901            if(number>=INT32_MAX/10) {
902                badNumber=TRUE;  // overflow
903            }
904            number=number*10+(c-0x30);
905        } else {
906            return UMSGPAT_ARG_NAME_NOT_NUMBER;
907        }
908    }
909    // There are only ASCII digits.
910    if(badNumber) {
911        return UMSGPAT_ARG_NAME_NOT_VALID;
912    } else {
913        return number;
914    }
915}
916
917void
918MessagePattern::parseDouble(int32_t start, int32_t limit, UBool allowInfinity,
919                            UParseError *parseError, UErrorCode &errorCode) {
920    if(U_FAILURE(errorCode)) {
921        return;
922    }
923    U_ASSERT(start<limit);
924    // fake loop for easy exit and single throw statement
925    for(;;) {
926        // fast path for small integers and infinity
927        int32_t value=0;
928        int32_t isNegative=0;  // not boolean so that we can easily add it to value
929        int32_t index=start;
930        UChar c=msg.charAt(index++);
931        if(c==u_minus) {
932            isNegative=1;
933            if(index==limit) {
934                break;  // no number
935            }
936            c=msg.charAt(index++);
937        } else if(c==u_plus) {
938            if(index==limit) {
939                break;  // no number
940            }
941            c=msg.charAt(index++);
942        }
943        if(c==0x221e) {  // infinity
944            if(allowInfinity && index==limit) {
945                double infinity=uprv_getInfinity();
946                addArgDoublePart(
947                    isNegative!=0 ? -infinity : infinity,
948                    start, limit-start, errorCode);
949                return;
950            } else {
951                break;
952            }
953        }
954        // try to parse the number as a small integer but fall back to a double
955        while('0'<=c && c<='9') {
956            value=value*10+(c-'0');
957            if(value>(Part::MAX_VALUE+isNegative)) {
958                break;  // not a small-enough integer
959            }
960            if(index==limit) {
961                addPart(UMSGPAT_PART_TYPE_ARG_INT, start, limit-start,
962                        isNegative!=0 ? -value : value, errorCode);
963                return;
964            }
965            c=msg.charAt(index++);
966        }
967        // Let Double.parseDouble() throw a NumberFormatException.
968        char numberChars[128];
969        int32_t capacity=(int32_t)sizeof(numberChars);
970        int32_t length=limit-start;
971        if(length>=capacity) {
972            break;  // number too long
973        }
974        msg.extract(start, length, numberChars, capacity, US_INV);
975        if((int32_t)uprv_strlen(numberChars)<length) {
976            break;  // contains non-invariant character that was turned into NUL
977        }
978        char *end;
979        double numericValue=uprv_strtod(numberChars, &end);
980        if(end!=(numberChars+length)) {
981            break;  // parsing error
982        }
983        addArgDoublePart(numericValue, start, length, errorCode);
984        return;
985    }
986    setParseError(parseError, start /*, limit*/);  // Bad syntax for numeric value.
987    errorCode=U_PATTERN_SYNTAX_ERROR;
988    return;
989}
990
991int32_t
992MessagePattern::skipWhiteSpace(int32_t index) {
993    const UChar *s=msg.getBuffer();
994    int32_t msgLength=msg.length();
995    const UChar *t=PatternProps::skipWhiteSpace(s+index, msgLength-index);
996    return (int32_t)(t-s);
997}
998
999int32_t
1000MessagePattern::skipIdentifier(int32_t index) {
1001    const UChar *s=msg.getBuffer();
1002    int32_t msgLength=msg.length();
1003    const UChar *t=PatternProps::skipIdentifier(s+index, msgLength-index);
1004    return (int32_t)(t-s);
1005}
1006
1007int32_t
1008MessagePattern::skipDouble(int32_t index) {
1009    int32_t msgLength=msg.length();
1010    while(index<msgLength) {
1011        UChar c=msg.charAt(index);
1012        // U+221E: Allow the infinity symbol, for ChoiceFormat patterns.
1013        if((c<0x30 && c!=u_plus && c!=u_minus && c!=u_dot) || (c>0x39 && c!=u_e && c!=u_E && c!=0x221e)) {
1014            break;
1015        }
1016        ++index;
1017    }
1018    return index;
1019}
1020
1021UBool
1022MessagePattern::isArgTypeChar(UChar32 c) {
1023    return (u_a<=c && c<=u_z) || (u_A<=c && c<=u_Z);
1024}
1025
1026UBool
1027MessagePattern::isChoice(int32_t index) {
1028    UChar c;
1029    return
1030        ((c=msg.charAt(index++))==u_c || c==u_C) &&
1031        ((c=msg.charAt(index++))==u_h || c==u_H) &&
1032        ((c=msg.charAt(index++))==u_o || c==u_O) &&
1033        ((c=msg.charAt(index++))==u_i || c==u_I) &&
1034        ((c=msg.charAt(index++))==u_c || c==u_C) &&
1035        ((c=msg.charAt(index))==u_e || c==u_E);
1036}
1037
1038UBool
1039MessagePattern::isPlural(int32_t index) {
1040    UChar c;
1041    return
1042        ((c=msg.charAt(index++))==u_p || c==u_P) &&
1043        ((c=msg.charAt(index++))==u_l || c==u_L) &&
1044        ((c=msg.charAt(index++))==u_u || c==u_U) &&
1045        ((c=msg.charAt(index++))==u_r || c==u_R) &&
1046        ((c=msg.charAt(index++))==u_a || c==u_A) &&
1047        ((c=msg.charAt(index))==u_l || c==u_L);
1048}
1049
1050UBool
1051MessagePattern::isSelect(int32_t index) {
1052    UChar c;
1053    return
1054        ((c=msg.charAt(index++))==u_s || c==u_S) &&
1055        ((c=msg.charAt(index++))==u_e || c==u_E) &&
1056        ((c=msg.charAt(index++))==u_l || c==u_L) &&
1057        ((c=msg.charAt(index++))==u_e || c==u_E) &&
1058        ((c=msg.charAt(index++))==u_c || c==u_C) &&
1059        ((c=msg.charAt(index))==u_t || c==u_T);
1060}
1061
1062UBool
1063MessagePattern::inMessageFormatPattern(int32_t nestingLevel) {
1064    return nestingLevel>0 || partsList->a[0].type==UMSGPAT_PART_TYPE_MSG_START;
1065}
1066
1067UBool
1068MessagePattern::inTopLevelChoiceMessage(int32_t nestingLevel, UMessagePatternArgType parentType) {
1069    return
1070        nestingLevel==1 &&
1071        parentType==UMSGPAT_ARG_TYPE_CHOICE &&
1072        partsList->a[0].type!=UMSGPAT_PART_TYPE_MSG_START;
1073}
1074
1075void
1076MessagePattern::addPart(UMessagePatternPartType type, int32_t index, int32_t length,
1077                        int32_t value, UErrorCode &errorCode) {
1078    if(partsList->ensureCapacityForOneMore(partsLength, errorCode)) {
1079        Part &part=partsList->a[partsLength++];
1080        part.type=type;
1081        part.index=index;
1082        part.length=(uint16_t)length;
1083        part.value=(int16_t)value;
1084        part.limitPartIndex=0;
1085    }
1086}
1087
1088void
1089MessagePattern::addLimitPart(int32_t start,
1090                             UMessagePatternPartType type, int32_t index, int32_t length,
1091                             int32_t value, UErrorCode &errorCode) {
1092    partsList->a[start].limitPartIndex=partsLength;
1093    addPart(type, index, length, value, errorCode);
1094}
1095
1096void
1097MessagePattern::addArgDoublePart(double numericValue, int32_t start, int32_t length,
1098                                 UErrorCode &errorCode) {
1099    if(U_FAILURE(errorCode)) {
1100        return;
1101    }
1102    int32_t numericIndex=numericValuesLength;
1103    if(numericValuesList==NULL) {
1104        numericValuesList=new MessagePatternDoubleList();
1105        if(numericValuesList==NULL) {
1106            errorCode=U_MEMORY_ALLOCATION_ERROR;
1107            return;
1108        }
1109    } else if(!numericValuesList->ensureCapacityForOneMore(numericValuesLength, errorCode)) {
1110        return;
1111    } else {
1112        if(numericIndex>Part::MAX_VALUE) {
1113            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1114            return;
1115        }
1116    }
1117    numericValuesList->a[numericValuesLength++]=numericValue;
1118    addPart(UMSGPAT_PART_TYPE_ARG_DOUBLE, start, length, numericIndex, errorCode);
1119}
1120
1121void
1122MessagePattern::setParseError(UParseError *parseError, int32_t index) {
1123    if(parseError==NULL) {
1124        return;
1125    }
1126    parseError->offset=index;
1127
1128    // Set preContext to some of msg before index.
1129    // Avoid splitting a surrogate pair.
1130    int32_t length=index;
1131    if(length>=U_PARSE_CONTEXT_LEN) {
1132        length=U_PARSE_CONTEXT_LEN-1;
1133        if(length>0 && U16_IS_TRAIL(msg[index-length])) {
1134            --length;
1135        }
1136    }
1137    msg.extract(index-length, length, parseError->preContext);
1138    parseError->preContext[length]=0;
1139
1140    // Set postContext to some of msg starting at index.
1141    length=msg.length()-index;
1142    if(length>=U_PARSE_CONTEXT_LEN) {
1143        length=U_PARSE_CONTEXT_LEN-1;
1144        if(length>0 && U16_IS_LEAD(msg[index+length-1])) {
1145            --length;
1146        }
1147    }
1148    msg.extract(index, length, parseError->postContext);
1149    parseError->postContext[length]=0;
1150}
1151
1152UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(MessagePattern)
1153
1154// MessageImpl ------------------------------------------------------------- ***
1155
1156void
1157MessageImpl::appendReducedApostrophes(const UnicodeString &s, int32_t start, int32_t limit,
1158                                      UnicodeString &sb) {
1159    int32_t doubleApos=-1;
1160    for(;;) {
1161        int32_t i=s.indexOf(u_apos, start);
1162        if(i<0 || i>=limit) {
1163            sb.append(s, start, limit-start);
1164            break;
1165        }
1166        if(i==doubleApos) {
1167            // Double apostrophe at start-1 and start==i, append one.
1168            sb.append(u_apos);
1169            ++start;
1170            doubleApos=-1;
1171        } else {
1172            // Append text between apostrophes and skip this one.
1173            sb.append(s, start, i-start);
1174            doubleApos=start=i+1;
1175        }
1176    }
1177}
1178
1179// Ported from second half of ICU4J SelectFormat.format(String).
1180UnicodeString &
1181MessageImpl::appendSubMessageWithoutSkipSyntax(const MessagePattern &msgPattern,
1182                                               int32_t msgStart,
1183                                               UnicodeString &result) {
1184    const UnicodeString &msgString=msgPattern.getPatternString();
1185    int32_t prevIndex=msgPattern.getPart(msgStart).getLimit();
1186    for(int32_t i=msgStart;;) {
1187        const MessagePattern::Part &part=msgPattern.getPart(++i);
1188        UMessagePatternPartType type=part.getType();
1189        int32_t index=part.getIndex();
1190        if(type==UMSGPAT_PART_TYPE_MSG_LIMIT) {
1191            return result.append(msgString, prevIndex, index-prevIndex);
1192        } else if(type==UMSGPAT_PART_TYPE_SKIP_SYNTAX) {
1193            result.append(msgString, prevIndex, index-prevIndex);
1194            prevIndex=part.getLimit();
1195        } else if(type==UMSGPAT_PART_TYPE_ARG_START) {
1196            result.append(msgString, prevIndex, index-prevIndex);
1197            prevIndex=index;
1198            i=msgPattern.getLimitPartIndex(i);
1199            index=msgPattern.getPart(i).getLimit();
1200            appendReducedApostrophes(msgString, prevIndex, index, result);
1201            prevIndex=index;
1202        }
1203    }
1204}
1205
1206U_NAMESPACE_END
1207
1208#endif  // !UCONFIG_NO_FORMATTING
1209