1/*
2*******************************************************************************
3*   Copyright (C) 2011-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  messagepattern.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2011mar14
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_FORMATTING
18
19#include "unicode/messagepattern.h"
20#include "unicode/unistr.h"
21#include "unicode/utf16.h"
22#include "cmemory.h"
23#include "cstring.h"
24#include "messageimpl.h"
25#include "patternprops.h"
26#include "putilimp.h"
27#include "uassert.h"
28
29U_NAMESPACE_BEGIN
30
31// Unicode character/code point constants ---------------------------------- ***
32
33static const UChar u_pound=0x23;
34static const UChar u_apos=0x27;
35static const UChar u_plus=0x2B;
36static const UChar u_comma=0x2C;
37static const UChar u_minus=0x2D;
38static const UChar u_dot=0x2E;
39static const UChar u_colon=0x3A;
40static const UChar u_lessThan=0x3C;
41static const UChar u_equal=0x3D;
42static const UChar u_A=0x41;
43static const UChar u_C=0x43;
44static const UChar u_D=0x44;
45static const UChar u_E=0x45;
46static const UChar u_H=0x48;
47static const UChar u_I=0x49;
48static const UChar u_L=0x4C;
49static const UChar u_N=0x4E;
50static const UChar u_O=0x4F;
51static const UChar u_P=0x50;
52static const UChar u_R=0x52;
53static const UChar u_S=0x53;
54static const UChar u_T=0x54;
55static const UChar u_U=0x55;
56static const UChar u_Z=0x5A;
57static const UChar u_a=0x61;
58static const UChar u_c=0x63;
59static const UChar u_d=0x64;
60static const UChar u_e=0x65;
61static const UChar u_f=0x66;
62static const UChar u_h=0x68;
63static const UChar u_i=0x69;
64static const UChar u_l=0x6C;
65static const UChar u_n=0x6E;
66static const UChar u_o=0x6F;
67static const UChar u_p=0x70;
68static const UChar u_r=0x72;
69static const UChar u_s=0x73;
70static const UChar u_t=0x74;
71static const UChar u_u=0x75;
72static const UChar u_z=0x7A;
73static const UChar u_leftCurlyBrace=0x7B;
74static const UChar u_pipe=0x7C;
75static const UChar u_rightCurlyBrace=0x7D;
76static const UChar u_lessOrEqual=0x2264;  // U+2264 is <=
77
78static const UChar kOffsetColon[]={  // "offset:"
79    u_o, u_f, u_f, u_s, u_e, u_t, u_colon
80};
81
82static const UChar kOther[]={  // "other"
83    u_o, u_t, u_h, u_e, u_r
84};
85
86// MessagePatternList ------------------------------------------------------ ***
87
88template<typename T, int32_t stackCapacity>
89class MessagePatternList : public UMemory {
90public:
91    MessagePatternList() {}
92    void copyFrom(const MessagePatternList<T, stackCapacity> &other,
93                  int32_t length,
94                  UErrorCode &errorCode);
95    UBool ensureCapacityForOneMore(int32_t oldLength, UErrorCode &errorCode);
96    UBool equals(const MessagePatternList<T, stackCapacity> &other, int32_t length) const {
97        for(int32_t i=0; i<length; ++i) {
98            if(a[i]!=other.a[i]) { return FALSE; }
99        }
100        return TRUE;
101    }
102
103    MaybeStackArray<T, stackCapacity> a;
104};
105
106template<typename T, int32_t stackCapacity>
107void
108MessagePatternList<T, stackCapacity>::copyFrom(
109        const MessagePatternList<T, stackCapacity> &other,
110        int32_t length,
111        UErrorCode &errorCode) {
112    if(U_SUCCESS(errorCode) && length>0) {
113        if(length>a.getCapacity() && NULL==a.resize(length)) {
114            errorCode=U_MEMORY_ALLOCATION_ERROR;
115            return;
116        }
117        uprv_memcpy(a.getAlias(), other.a.getAlias(), length*sizeof(T));
118    }
119}
120
121template<typename T, int32_t stackCapacity>
122UBool
123MessagePatternList<T, stackCapacity>::ensureCapacityForOneMore(int32_t oldLength, UErrorCode &errorCode) {
124    if(U_FAILURE(errorCode)) {
125        return FALSE;
126    }
127    if(a.getCapacity()>oldLength || a.resize(2*oldLength, oldLength)!=NULL) {
128        return TRUE;
129    }
130    errorCode=U_MEMORY_ALLOCATION_ERROR;
131    return FALSE;
132}
133
134// MessagePatternList specializations -------------------------------------- ***
135
136class MessagePatternDoubleList : public MessagePatternList<double, 8> {
137};
138
139class MessagePatternPartsList : public MessagePatternList<MessagePattern::Part, 32> {
140};
141
142// MessagePattern constructors etc. ---------------------------------------- ***
143
144MessagePattern::MessagePattern(UErrorCode &errorCode)
145        : aposMode(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE),
146          partsList(NULL), parts(NULL), partsLength(0),
147          numericValuesList(NULL), numericValues(NULL), numericValuesLength(0),
148          hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) {
149    init(errorCode);
150}
151
152MessagePattern::MessagePattern(UMessagePatternApostropheMode mode, UErrorCode &errorCode)
153        : aposMode(mode),
154          partsList(NULL), parts(NULL), partsLength(0),
155          numericValuesList(NULL), numericValues(NULL), numericValuesLength(0),
156          hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) {
157    init(errorCode);
158}
159
160MessagePattern::MessagePattern(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode)
161        : aposMode(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE),
162          partsList(NULL), parts(NULL), partsLength(0),
163          numericValuesList(NULL), numericValues(NULL), numericValuesLength(0),
164          hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) {
165    if(init(errorCode)) {
166        parse(pattern, parseError, errorCode);
167    }
168}
169
170UBool
171MessagePattern::init(UErrorCode &errorCode) {
172    if(U_FAILURE(errorCode)) {
173        return FALSE;
174    }
175    partsList=new MessagePatternPartsList();
176    if(partsList==NULL) {
177        errorCode=U_MEMORY_ALLOCATION_ERROR;
178        return FALSE;
179    }
180    parts=partsList->a.getAlias();
181    return TRUE;
182}
183
184MessagePattern::MessagePattern(const MessagePattern &other)
185        : UObject(other), aposMode(other.aposMode), msg(other.msg),
186          partsList(NULL), parts(NULL), partsLength(0),
187          numericValuesList(NULL), numericValues(NULL), numericValuesLength(0),
188          hasArgNames(other.hasArgNames), hasArgNumbers(other.hasArgNumbers),
189          needsAutoQuoting(other.needsAutoQuoting) {
190    UErrorCode errorCode=U_ZERO_ERROR;
191    if(!copyStorage(other, errorCode)) {
192        clear();
193    }
194}
195
196MessagePattern &
197MessagePattern::operator=(const MessagePattern &other) {
198    if(this==&other) {
199        return *this;
200    }
201    aposMode=other.aposMode;
202    msg=other.msg;
203    hasArgNames=other.hasArgNames;
204    hasArgNumbers=other.hasArgNumbers;
205    needsAutoQuoting=other.needsAutoQuoting;
206    UErrorCode errorCode=U_ZERO_ERROR;
207    if(!copyStorage(other, errorCode)) {
208        clear();
209    }
210    return *this;
211}
212
213UBool
214MessagePattern::copyStorage(const MessagePattern &other, UErrorCode &errorCode) {
215    if(U_FAILURE(errorCode)) {
216        return FALSE;
217    }
218    parts=NULL;
219    partsLength=0;
220    numericValues=NULL;
221    numericValuesLength=0;
222    if(partsList==NULL) {
223        partsList=new MessagePatternPartsList();
224        if(partsList==NULL) {
225            errorCode=U_MEMORY_ALLOCATION_ERROR;
226            return FALSE;
227        }
228        parts=partsList->a.getAlias();
229    }
230    if(other.partsLength>0) {
231        partsList->copyFrom(*other.partsList, other.partsLength, errorCode);
232        if(U_FAILURE(errorCode)) {
233            return FALSE;
234        }
235        parts=partsList->a.getAlias();
236        partsLength=other.partsLength;
237    }
238    if(other.numericValuesLength>0) {
239        if(numericValuesList==NULL) {
240            numericValuesList=new MessagePatternDoubleList();
241            if(numericValuesList==NULL) {
242                errorCode=U_MEMORY_ALLOCATION_ERROR;
243                return FALSE;
244            }
245            numericValues=numericValuesList->a.getAlias();
246        }
247        numericValuesList->copyFrom(
248            *other.numericValuesList, other.numericValuesLength, errorCode);
249        if(U_FAILURE(errorCode)) {
250            return FALSE;
251        }
252        numericValues=numericValuesList->a.getAlias();
253        numericValuesLength=other.numericValuesLength;
254    }
255    return TRUE;
256}
257
258MessagePattern::~MessagePattern() {
259    delete partsList;
260    delete numericValuesList;
261}
262
263// MessagePattern API ------------------------------------------------------ ***
264
265MessagePattern &
266MessagePattern::parse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) {
267    preParse(pattern, parseError, errorCode);
268    parseMessage(0, 0, 0, UMSGPAT_ARG_TYPE_NONE, parseError, errorCode);
269    postParse();
270    return *this;
271}
272
273MessagePattern &
274MessagePattern::parseChoiceStyle(const UnicodeString &pattern,
275                                 UParseError *parseError, UErrorCode &errorCode) {
276    preParse(pattern, parseError, errorCode);
277    parseChoiceStyle(0, 0, parseError, errorCode);
278    postParse();
279    return *this;
280}
281
282MessagePattern &
283MessagePattern::parsePluralStyle(const UnicodeString &pattern,
284                                 UParseError *parseError, UErrorCode &errorCode) {
285    preParse(pattern, parseError, errorCode);
286    parsePluralOrSelectStyle(UMSGPAT_ARG_TYPE_PLURAL, 0, 0, parseError, errorCode);
287    postParse();
288    return *this;
289}
290
291MessagePattern &
292MessagePattern::parseSelectStyle(const UnicodeString &pattern,
293                                 UParseError *parseError, UErrorCode &errorCode) {
294    preParse(pattern, parseError, errorCode);
295    parsePluralOrSelectStyle(UMSGPAT_ARG_TYPE_SELECT, 0, 0, parseError, errorCode);
296    postParse();
297    return *this;
298}
299
300void
301MessagePattern::clear() {
302    // Mostly the same as preParse().
303    msg.remove();
304    hasArgNames=hasArgNumbers=FALSE;
305    needsAutoQuoting=FALSE;
306    partsLength=0;
307    numericValuesLength=0;
308}
309
310UBool
311MessagePattern::operator==(const MessagePattern &other) const {
312    if(this==&other) {
313        return TRUE;
314    }
315    return
316        aposMode==other.aposMode &&
317        msg==other.msg &&
318        // parts.equals(o.parts)
319        partsLength==other.partsLength &&
320        (partsLength==0 || partsList->equals(*other.partsList, partsLength));
321    // No need to compare numericValues if msg and parts are the same.
322}
323
324int32_t
325MessagePattern::hashCode() const {
326    int32_t hash=(aposMode*37+msg.hashCode())*37+partsLength;
327    for(int32_t i=0; i<partsLength; ++i) {
328        hash=hash*37+parts[i].hashCode();
329    }
330    return hash;
331}
332
333int32_t
334MessagePattern::validateArgumentName(const UnicodeString &name) {
335    if(!PatternProps::isIdentifier(name.getBuffer(), name.length())) {
336        return UMSGPAT_ARG_NAME_NOT_VALID;
337    }
338    return parseArgNumber(name, 0, name.length());
339}
340
341UnicodeString
342MessagePattern::autoQuoteApostropheDeep() const {
343    if(!needsAutoQuoting) {
344        return msg;
345    }
346    UnicodeString modified(msg);
347    // Iterate backward so that the insertion indexes do not change.
348    int32_t count=countParts();
349    for(int32_t i=count; i>0;) {
350        const Part &part=getPart(--i);
351        if(part.getType()==UMSGPAT_PART_TYPE_INSERT_CHAR) {
352           modified.insert(part.index, (UChar)part.value);
353        }
354    }
355    return modified;
356}
357
358double
359MessagePattern::getNumericValue(const Part &part) const {
360    UMessagePatternPartType type=part.type;
361    if(type==UMSGPAT_PART_TYPE_ARG_INT) {
362        return part.value;
363    } else if(type==UMSGPAT_PART_TYPE_ARG_DOUBLE) {
364        return numericValues[part.value];
365    } else {
366        return UMSGPAT_NO_NUMERIC_VALUE;
367    }
368}
369
370/**
371  * Returns the "offset:" value of a PluralFormat argument, or 0 if none is specified.
372  * @param pluralStart the index of the first PluralFormat argument style part. (0..countParts()-1)
373  * @return the "offset:" value.
374  * @draft ICU 4.8
375  */
376double
377MessagePattern::getPluralOffset(int32_t pluralStart) const {
378    const Part &part=getPart(pluralStart);
379    if(Part::hasNumericValue(part.type)) {
380        return getNumericValue(part);
381    } else {
382        return 0;
383    }
384}
385
386// MessagePattern::Part ---------------------------------------------------- ***
387
388UBool
389MessagePattern::Part::operator==(const Part &other) const {
390    if(this==&other) {
391        return TRUE;
392    }
393    return
394        type==other.type &&
395        index==other.index &&
396        length==other.length &&
397        value==other.value &&
398        limitPartIndex==other.limitPartIndex;
399}
400
401// MessagePattern parser --------------------------------------------------- ***
402
403void
404MessagePattern::preParse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) {
405    if(U_FAILURE(errorCode)) {
406        return;
407    }
408    if(parseError!=NULL) {
409        parseError->line=0;
410        parseError->offset=0;
411        parseError->preContext[0]=0;
412        parseError->postContext[0]=0;
413    }
414    msg=pattern;
415    hasArgNames=hasArgNumbers=FALSE;
416    needsAutoQuoting=FALSE;
417    partsLength=0;
418    numericValuesLength=0;
419}
420
421void
422MessagePattern::postParse() {
423    if(partsList!=NULL) {
424        parts=partsList->a.getAlias();
425    }
426    if(numericValuesList!=NULL) {
427        numericValues=numericValuesList->a.getAlias();
428    }
429}
430
/**
 * Parses one message (the whole pattern, or a nested message fragment)
 * starting at `index`, and records the parts that it finds.
 *
 * A MSG_START part is added first. The text is then scanned for apostrophe
 * quoting, an unquoted '#' (only when the parent has a plural style),
 * nested arguments introduced by '{', and the terminator of a nested message.
 * A MSG_LIMIT part is added at the end.
 *
 * @param index          pattern string index where the message starts
 * @param msgStartLength length of the start delimiter that MSG_START covers
 * @param nestingLevel   0 at the top level; recorded as the value of MSG_START/MSG_LIMIT
 * @param parentType     the type of the enclosing argument; selects the choice ('|')
 *                       and plural ('#') special cases
 * @param parseError     receives error details via setParseError()
 * @param errorCode      in/out error code; nothing is parsed if it is already a failure
 * @return 0 on failure. Otherwise the index after the message: for a choice
 *         parent this is the index of the terminating '}' or '|' itself,
 *         for other nested messages the index after the '}',
 *         and at the end of the pattern string the index of that end.
 */
int32_t
MessagePattern::parseMessage(int32_t index, int32_t msgStartLength,
                             int32_t nestingLevel, UMessagePatternArgType parentType,
                             UParseError *parseError, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    // The nesting level is stored as a part value, so it must fit that field.
    if(nestingLevel>Part::MAX_VALUE) {
        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    // Remember where MSG_START goes so that addLimitPart() can refer back to it.
    int32_t msgStart=partsLength;
    addPart(UMSGPAT_PART_TYPE_MSG_START, index, msgStartLength, nestingLevel, errorCode);
    index+=msgStartLength;
    for(;;) {  // while(index<msg.length()) with U_FAILURE(errorCode) check
        // Checked at the top of every iteration so that a failure from
        // addPart() or parseArg() in the previous iteration stops parsing.
        if(U_FAILURE(errorCode)) {
            return 0;
        }
        if(index>=msg.length()) {
            break;
        }
        // From here on, index points just past c.
        UChar c=msg.charAt(index++);
        if(c==u_apos) {
            if(index==msg.length()) {
                // The apostrophe is the last character in the pattern.
                // Add a Part for auto-quoting.
                addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0,
                        u_apos, errorCode);  // value=char to be inserted
                needsAutoQuoting=TRUE;
            } else {
                // Look at the character after the apostrophe without consuming it.
                c=msg.charAt(index);
                if(c==u_apos) {
                    // double apostrophe, skip the second one
                    addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index++, 1, 0, errorCode);
                } else if(
                    // The apostrophe starts quoted text if the mode says so for
                    // every apostrophe, or if it precedes a character that is
                    // syntax in the current context.
                    aposMode==UMSGPAT_APOS_DOUBLE_REQUIRED ||
                    c==u_leftCurlyBrace || c==u_rightCurlyBrace ||
                    (parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_pipe) ||
                    (UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE(parentType) && c==u_pound)
                ) {
                    // skip the quote-starting apostrophe
                    addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index-1, 1, 0, errorCode);
                    // find the end of the quoted literal text
                    for(;;) {
                        // The search starts at index+1: the character at index is
                        // either the first quoted character (known not to be an
                        // apostrophe) or the second half of a doubled apostrophe.
                        index=msg.indexOf(u_apos, index+1);
                        if(index>=0) {
                            // NOTE(review): with the bounds check commented out, this relies on
                            // charAt() returning something other than an apostrophe when
                            // index+1 is past the end of the string -- confirm.
                            if(/*(index+1)<msg.length() &&*/ msg.charAt(index+1)==u_apos) {
                                // double apostrophe inside quoted literal text
                                // still encodes a single apostrophe, skip the second one
                                addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, ++index, 1, 0, errorCode);
                            } else {
                                // skip the quote-ending apostrophe
                                addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index++, 1, 0, errorCode);
                                break;
                            }
                        } else {
                            // The quoted text reaches to the end of the message.
                            index=msg.length();
                            // Add a Part for auto-quoting.
                            addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0,
                                    u_apos, errorCode);  // value=char to be inserted
                            needsAutoQuoting=TRUE;
                            break;
                        }
                    }
                } else {
                    // Interpret the apostrophe as literal text.
                    // Add a Part for auto-quoting.
                    addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0,
                            u_apos, errorCode);  // value=char to be inserted
                    needsAutoQuoting=TRUE;
                }
            }
        } else if(UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE(parentType) && c==u_pound) {
            // The unquoted # in a plural message fragment will be replaced
            // with the (number-offset).
            addPart(UMSGPAT_PART_TYPE_REPLACE_NUMBER, index-1, 1, 0, errorCode);
        } else if(c==u_leftCurlyBrace) {
            // Nested argument: parseArg() gets the index of the '{' and
            // returns the index after the matching '}' (0 on failure).
            index=parseArg(index-1, 1, nestingLevel, parseError, errorCode);
        } else if((nestingLevel>0 && c==u_rightCurlyBrace) ||
                  (parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_pipe)) {
            // Finish the message before the terminator.
            // In a choice style, report the "}" substring only for the following ARG_LIMIT,
            // not for this MSG_LIMIT.
            int32_t limitLength=(parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_rightCurlyBrace) ? 0 : 1;
            addLimitPart(msgStart, UMSGPAT_PART_TYPE_MSG_LIMIT, index-1, limitLength,
                         nestingLevel, errorCode);
            if(parentType==UMSGPAT_ARG_TYPE_CHOICE) {
                // Let the choice style parser see the '}' or '|'.
                return index-1;
            } else {
                // continue parsing after the '}'
                return index;
            }
        }  // else: c is part of literal text
    }
    // The end of the pattern string was reached without a terminator.
    // That is an error for a nested message, except for the case that
    // inTopLevelChoiceMessage() accepts.
    if(nestingLevel>0 && !inTopLevelChoiceMessage(nestingLevel, parentType)) {
        setParseError(parseError, 0);  // Unmatched '{' braces in message.
        errorCode=U_UNMATCHED_BRACES;
        return 0;
    }
    // Zero-length MSG_LIMIT at the end of the pattern string.
    addLimitPart(msgStart, UMSGPAT_PART_TYPE_MSG_LIMIT, index, 0, nestingLevel, errorCode);
    return index;
}
535
/**
 * Parses one argument: {name-or-number}, {name-or-number,type}
 * or {name-or-number,type,style}, and records its parts.
 *
 * An ARG_START part is added first with the argument type NONE;
 * its value is patched once the type name has been recognized.
 * An ARG_LIMIT part is added for the closing '}'.
 *
 * @param index          pattern string index of the argument start (the '{')
 * @param argStartLength length of the start delimiter that ARG_START covers
 * @param nestingLevel   nesting level of the enclosing message; passed on to
 *                       the choice and plural/select style parsers
 * @param parseError     receives error details via setParseError()
 * @param errorCode      in/out error code
 * @return the index after the closing '}'; 0 if an error is detected here.
 *         NOTE(review): when a style parser fails and returns 0, this function
 *         still proceeds to addLimitPart() and returns 1, with errorCode
 *         holding the failure.
 */
int32_t
MessagePattern::parseArg(int32_t index, int32_t argStartLength, int32_t nestingLevel,
                         UParseError *parseError, UErrorCode &errorCode) {
    // Remember where ARG_START goes: its value is patched below, and
    // addLimitPart() refers back to it.
    int32_t argStart=partsLength;
    UMessagePatternArgType argType=UMSGPAT_ARG_TYPE_NONE;
    addPart(UMSGPAT_PART_TYPE_ARG_START, index, argStartLength, argType, errorCode);
    // This also covers a failure that was already set before the call.
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    int32_t nameIndex=index=skipWhiteSpace(index+argStartLength);
    if(index==msg.length()) {
        setParseError(parseError, 0);  // Unmatched '{' braces in message.
        errorCode=U_UNMATCHED_BRACES;
        return 0;
    }
    // parse argument name or number
    index=skipIdentifier(index);
    // number>=0: argument number; ARG_NAME_NOT_NUMBER: argument name;
    // anything else: not a valid name.
    int32_t number=parseArgNumber(nameIndex, index);
    if(number>=0) {
        int32_t length=index-nameIndex;
        if(length>Part::MAX_LENGTH || number>Part::MAX_VALUE) {
            setParseError(parseError, nameIndex);  // Argument number too large.
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
        hasArgNumbers=TRUE;
        addPart(UMSGPAT_PART_TYPE_ARG_NUMBER, nameIndex, length, number, errorCode);
    } else if(number==UMSGPAT_ARG_NAME_NOT_NUMBER) {
        int32_t length=index-nameIndex;
        if(length>Part::MAX_LENGTH) {
            setParseError(parseError, nameIndex);  // Argument name too long.
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
        hasArgNames=TRUE;
        addPart(UMSGPAT_PART_TYPE_ARG_NAME, nameIndex, length, 0, errorCode);
    } else {  // number<-1 (ARG_NAME_NOT_VALID)
        setParseError(parseError, nameIndex);  // Bad argument syntax.
        errorCode=U_PATTERN_SYNTAX_ERROR;
        return 0;
    }
    index=skipWhiteSpace(index);
    if(index==msg.length()) {
        setParseError(parseError, 0);  // Unmatched '{' braces in message.
        errorCode=U_UNMATCHED_BRACES;
        return 0;
    }
    // After the name or number, only '}' or ',' is allowed.
    UChar c=msg.charAt(index);
    if(c==u_rightCurlyBrace) {
        // all done
    } else if(c!=u_comma) {
        setParseError(parseError, nameIndex);  // Bad argument syntax.
        errorCode=U_PATTERN_SYNTAX_ERROR;
        return 0;
    } else /* ',' */ {
        // parse argument type: case-sensitive a-zA-Z
        int32_t typeIndex=index=skipWhiteSpace(index+1);
        while(index<msg.length() && isArgTypeChar(msg.charAt(index))) {
            ++index;
        }
        // Length of the type name, measured before the trailing white space is skipped.
        int32_t length=index-typeIndex;
        index=skipWhiteSpace(index);
        if(index==msg.length()) {
            setParseError(parseError, 0);  // Unmatched '{' braces in message.
            errorCode=U_UNMATCHED_BRACES;
            return 0;
        }
        // The type name must not be empty and must be followed by ',' or '}'.
        // c is updated here and tested again further below.
        if(length==0 || ((c=msg.charAt(index))!=u_comma && c!=u_rightCurlyBrace)) {
            setParseError(parseError, nameIndex);  // Bad argument syntax.
            errorCode=U_PATTERN_SYNTAX_ERROR;
            return 0;
        }
        if(length>Part::MAX_LENGTH) {
            setParseError(parseError, nameIndex);  // Argument type name too long.
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
        // Any type name that is not recognized below is a simple type.
        argType=UMSGPAT_ARG_TYPE_SIMPLE;
        if(length==6) {
            // case-insensitive comparisons for complex-type names
            if(isChoice(typeIndex)) {
                argType=UMSGPAT_ARG_TYPE_CHOICE;
            } else if(isPlural(typeIndex)) {
                argType=UMSGPAT_ARG_TYPE_PLURAL;
            } else if(isSelect(typeIndex)) {
                argType=UMSGPAT_ARG_TYPE_SELECT;
            }
        } else if(length==13) {
            // 13 code units: a 6-unit "select" match followed by an isOrdinal() match.
            if(isSelect(typeIndex) && isOrdinal(typeIndex+6)) {
                argType=UMSGPAT_ARG_TYPE_SELECTORDINAL;
            }
        }
        // change the ARG_START type from NONE to argType
        partsList->a[argStart].value=(int16_t)argType;
        // Only simple types get an ARG_TYPE part for the type name.
        if(argType==UMSGPAT_ARG_TYPE_SIMPLE) {
            addPart(UMSGPAT_PART_TYPE_ARG_TYPE, typeIndex, length, 0, errorCode);
        }
        // look for an argument style (pattern)
        if(c==u_rightCurlyBrace) {
            if(argType!=UMSGPAT_ARG_TYPE_SIMPLE) {
                setParseError(parseError, nameIndex);  // No style field for complex argument.
                errorCode=U_PATTERN_SYNTAX_ERROR;
                return 0;
            }
        } else /* ',' */ {
            // Step over the ',' and hand the style text to the matching parser.
            ++index;
            if(argType==UMSGPAT_ARG_TYPE_SIMPLE) {
                index=parseSimpleStyle(index, parseError, errorCode);
            } else if(argType==UMSGPAT_ARG_TYPE_CHOICE) {
                index=parseChoiceStyle(index, nestingLevel, parseError, errorCode);
            } else {
                index=parsePluralOrSelectStyle(argType, index, nestingLevel, parseError, errorCode);
            }
        }
    }
    // Argument parsing stopped on the '}'.
    addLimitPart(argStart, UMSGPAT_PART_TYPE_ARG_LIMIT, index, 1, argType, errorCode);
    return index+1;
}
655
656int32_t
657MessagePattern::parseSimpleStyle(int32_t index, UParseError *parseError, UErrorCode &errorCode) {
658    if(U_FAILURE(errorCode)) {
659        return 0;
660    }
661    int32_t start=index;
662    int32_t nestedBraces=0;
663    while(index<msg.length()) {
664        UChar c=msg.charAt(index++);
665        if(c==u_apos) {
666            // Treat apostrophe as quoting but include it in the style part.
667            // Find the end of the quoted literal text.
668            index=msg.indexOf(u_apos, index);
669            if(index<0) {
670                // Quoted literal argument style text reaches to the end of the message.
671                setParseError(parseError, start);
672                errorCode=U_PATTERN_SYNTAX_ERROR;
673                return 0;
674            }
675            // skip the quote-ending apostrophe
676            ++index;
677        } else if(c==u_leftCurlyBrace) {
678            ++nestedBraces;
679        } else if(c==u_rightCurlyBrace) {
680            if(nestedBraces>0) {
681                --nestedBraces;
682            } else {
683                int32_t length=--index-start;
684                if(length>Part::MAX_LENGTH) {
685                    setParseError(parseError, start);  // Argument style text too long.
686                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
687                    return 0;
688                }
689                addPart(UMSGPAT_PART_TYPE_ARG_STYLE, start, length, 0, errorCode);
690                return index;
691            }
692        }  // c is part of literal text
693    }
694    setParseError(parseError, 0);  // Unmatched '{' braces in message.
695    errorCode=U_UNMATCHED_BRACES;
696    return 0;
697}
698
699int32_t
700MessagePattern::parseChoiceStyle(int32_t index, int32_t nestingLevel,
701                                 UParseError *parseError, UErrorCode &errorCode) {
702    if(U_FAILURE(errorCode)) {
703        return 0;
704    }
705    int32_t start=index;
706    index=skipWhiteSpace(index);
707    if(index==msg.length() || msg.charAt(index)==u_rightCurlyBrace) {
708        setParseError(parseError, 0);  // Missing choice argument pattern.
709        errorCode=U_PATTERN_SYNTAX_ERROR;
710        return 0;
711    }
712    for(;;) {
713        // The choice argument style contains |-separated (number, separator, message) triples.
714        // Parse the number.
715        int32_t numberIndex=index;
716        index=skipDouble(index);
717        int32_t length=index-numberIndex;
718        if(length==0) {
719            setParseError(parseError, start);  // Bad choice pattern syntax.
720            errorCode=U_PATTERN_SYNTAX_ERROR;
721            return 0;
722        }
723        if(length>Part::MAX_LENGTH) {
724            setParseError(parseError, numberIndex);  // Choice number too long.
725            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
726            return 0;
727        }
728        parseDouble(numberIndex, index, TRUE, parseError, errorCode);  // adds ARG_INT or ARG_DOUBLE
729        if(U_FAILURE(errorCode)) {
730            return 0;
731        }
732        // Parse the separator.
733        index=skipWhiteSpace(index);
734        if(index==msg.length()) {
735            setParseError(parseError, start);  // Bad choice pattern syntax.
736            errorCode=U_PATTERN_SYNTAX_ERROR;
737            return 0;
738        }
739        UChar c=msg.charAt(index);
740        if(!(c==u_pound || c==u_lessThan || c==u_lessOrEqual)) {  // U+2264 is <=
741            setParseError(parseError, start);  // Expected choice separator (#<\u2264) instead of c.
742            errorCode=U_PATTERN_SYNTAX_ERROR;
743            return 0;
744        }
745        addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, index, 1, 0, errorCode);
746        // Parse the message fragment.
747        index=parseMessage(++index, 0, nestingLevel+1, UMSGPAT_ARG_TYPE_CHOICE, parseError, errorCode);
748        if(U_FAILURE(errorCode)) {
749            return 0;
750        }
751        // parseMessage(..., CHOICE) returns the index of the terminator, or msg.length().
752        if(index==msg.length()) {
753            return index;
754        }
755        if(msg.charAt(index)==u_rightCurlyBrace) {
756            if(!inMessageFormatPattern(nestingLevel)) {
757                setParseError(parseError, start);  // Bad choice pattern syntax.
758                errorCode=U_PATTERN_SYNTAX_ERROR;
759                return 0;
760            }
761            return index;
762        }  // else the terminator is '|'
763        index=skipWhiteSpace(index+1);
764    }
765}
766
767int32_t
768MessagePattern::parsePluralOrSelectStyle(UMessagePatternArgType argType,
769                                         int32_t index, int32_t nestingLevel,
770                                         UParseError *parseError, UErrorCode &errorCode) {
771    if(U_FAILURE(errorCode)) {
772        return 0;
773    }
774    int32_t start=index;
775    UBool isEmpty=TRUE;
776    UBool hasOther=FALSE;
777    for(;;) {
778        // First, collect the selector looking for a small set of terminators.
779        // It would be a little faster to consider the syntax of each possible
780        // token right here, but that makes the code too complicated.
781        index=skipWhiteSpace(index);
782        UBool eos=index==msg.length();
783        if(eos || msg.charAt(index)==u_rightCurlyBrace) {
784            if(eos==inMessageFormatPattern(nestingLevel)) {
785                setParseError(parseError, start);  // Bad plural/select pattern syntax.
786                errorCode=U_PATTERN_SYNTAX_ERROR;
787                return 0;
788            }
789            if(!hasOther) {
790                setParseError(parseError, 0);  // Missing 'other' keyword in plural/select pattern.
791                errorCode=U_DEFAULT_KEYWORD_MISSING;
792                return 0;
793            }
794            return index;
795        }
796        int32_t selectorIndex=index;
797        if(UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE(argType) && msg.charAt(selectorIndex)==u_equal) {
798            // explicit-value plural selector: =double
799            index=skipDouble(index+1);
800            int32_t length=index-selectorIndex;
801            if(length==1) {
802                setParseError(parseError, start);  // Bad plural/select pattern syntax.
803                errorCode=U_PATTERN_SYNTAX_ERROR;
804                return 0;
805            }
806            if(length>Part::MAX_LENGTH) {
807                setParseError(parseError, selectorIndex);  // Argument selector too long.
808                errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
809                return 0;
810            }
811            addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, selectorIndex, length, 0, errorCode);
812            parseDouble(selectorIndex+1, index, FALSE,
813                        parseError, errorCode);  // adds ARG_INT or ARG_DOUBLE
814        } else {
815            index=skipIdentifier(index);
816            int32_t length=index-selectorIndex;
817            if(length==0) {
818                setParseError(parseError, start);  // Bad plural/select pattern syntax.
819                errorCode=U_PATTERN_SYNTAX_ERROR;
820                return 0;
821            }
822            // Note: The ':' in "offset:" is just beyond the skipIdentifier() range.
823            if( UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE(argType) && length==6 && index<msg.length() &&
824                0==msg.compare(selectorIndex, 7, kOffsetColon, 0, 7)
825            ) {
826                // plural offset, not a selector
827                if(!isEmpty) {
828                    // Plural argument 'offset:' (if present) must precede key-message pairs.
829                    setParseError(parseError, start);
830                    errorCode=U_PATTERN_SYNTAX_ERROR;
831                    return 0;
832                }
833                // allow whitespace between offset: and its value
834                int32_t valueIndex=skipWhiteSpace(index+1);  // The ':' is at index.
835                index=skipDouble(valueIndex);
836                if(index==valueIndex) {
837                    setParseError(parseError, start);  // Missing value for plural 'offset:'.
838                    errorCode=U_PATTERN_SYNTAX_ERROR;
839                    return 0;
840                }
841                if((index-valueIndex)>Part::MAX_LENGTH) {
842                    setParseError(parseError, valueIndex);  // Plural offset value too long.
843                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
844                    return 0;
845                }
846                parseDouble(valueIndex, index, FALSE,
847                            parseError, errorCode);  // adds ARG_INT or ARG_DOUBLE
848                if(U_FAILURE(errorCode)) {
849                    return 0;
850                }
851                isEmpty=FALSE;
852                continue;  // no message fragment after the offset
853            } else {
854                // normal selector word
855                if(length>Part::MAX_LENGTH) {
856                    setParseError(parseError, selectorIndex);  // Argument selector too long.
857                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
858                    return 0;
859                }
860                addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, selectorIndex, length, 0, errorCode);
861                if(0==msg.compare(selectorIndex, length, kOther, 0, 5)) {
862                    hasOther=TRUE;
863                }
864            }
865        }
866        if(U_FAILURE(errorCode)) {
867            return 0;
868        }
869
870        // parse the message fragment following the selector
871        index=skipWhiteSpace(index);
872        if(index==msg.length() || msg.charAt(index)!=u_leftCurlyBrace) {
873            setParseError(parseError, selectorIndex);  // No message fragment after plural/select selector.
874            errorCode=U_PATTERN_SYNTAX_ERROR;
875            return 0;
876        }
877        index=parseMessage(index, 1, nestingLevel+1, argType, parseError, errorCode);
878        if(U_FAILURE(errorCode)) {
879            return 0;
880        }
881        isEmpty=FALSE;
882    }
883}
884
885int32_t
886MessagePattern::parseArgNumber(const UnicodeString &s, int32_t start, int32_t limit) {
887    // If the identifier contains only ASCII digits, then it is an argument _number_
888    // and must not have leading zeros (except "0" itself).
889    // Otherwise it is an argument _name_.
890    if(start>=limit) {
891        return UMSGPAT_ARG_NAME_NOT_VALID;
892    }
893    int32_t number;
894    // Defer numeric errors until we know there are only digits.
895    UBool badNumber;
896    UChar c=s.charAt(start++);
897    if(c==0x30) {
898        if(start==limit) {
899            return 0;
900        } else {
901            number=0;
902            badNumber=TRUE;  // leading zero
903        }
904    } else if(0x31<=c && c<=0x39) {
905        number=c-0x30;
906        badNumber=FALSE;
907    } else {
908        return UMSGPAT_ARG_NAME_NOT_NUMBER;
909    }
910    while(start<limit) {
911        c=s.charAt(start++);
912        if(0x30<=c && c<=0x39) {
913            if(number>=INT32_MAX/10) {
914                badNumber=TRUE;  // overflow
915            }
916            number=number*10+(c-0x30);
917        } else {
918            return UMSGPAT_ARG_NAME_NOT_NUMBER;
919        }
920    }
921    // There are only ASCII digits.
922    if(badNumber) {
923        return UMSGPAT_ARG_NAME_NOT_VALID;
924    } else {
925        return number;
926    }
927}
928
929void
930MessagePattern::parseDouble(int32_t start, int32_t limit, UBool allowInfinity,
931                            UParseError *parseError, UErrorCode &errorCode) {
932    if(U_FAILURE(errorCode)) {
933        return;
934    }
935    U_ASSERT(start<limit);
936    // fake loop for easy exit and single throw statement
937    for(;;) { /*loop doesn't iterate*/
938        // fast path for small integers and infinity
939        int32_t value=0;
940        int32_t isNegative=0;  // not boolean so that we can easily add it to value
941        int32_t index=start;
942        UChar c=msg.charAt(index++);
943        if(c==u_minus) {
944            isNegative=1;
945            if(index==limit) {
946                break;  // no number
947            }
948            c=msg.charAt(index++);
949        } else if(c==u_plus) {
950            if(index==limit) {
951                break;  // no number
952            }
953            c=msg.charAt(index++);
954        }
955        if(c==0x221e) {  // infinity
956            if(allowInfinity && index==limit) {
957                double infinity=uprv_getInfinity();
958                addArgDoublePart(
959                    isNegative!=0 ? -infinity : infinity,
960                    start, limit-start, errorCode);
961                return;
962            } else {
963                break;
964            }
965        }
966        // try to parse the number as a small integer but fall back to a double
967        while('0'<=c && c<='9') {
968            value=value*10+(c-'0');
969            if(value>(Part::MAX_VALUE+isNegative)) {
970                break;  // not a small-enough integer
971            }
972            if(index==limit) {
973                addPart(UMSGPAT_PART_TYPE_ARG_INT, start, limit-start,
974                        isNegative!=0 ? -value : value, errorCode);
975                return;
976            }
977            c=msg.charAt(index++);
978        }
979        // Let Double.parseDouble() throw a NumberFormatException.
980        char numberChars[128];
981        int32_t capacity=(int32_t)sizeof(numberChars);
982        int32_t length=limit-start;
983        if(length>=capacity) {
984            break;  // number too long
985        }
986        msg.extract(start, length, numberChars, capacity, US_INV);
987        if((int32_t)uprv_strlen(numberChars)<length) {
988            break;  // contains non-invariant character that was turned into NUL
989        }
990        char *end;
991        double numericValue=uprv_strtod(numberChars, &end);
992        if(end!=(numberChars+length)) {
993            break;  // parsing error
994        }
995        addArgDoublePart(numericValue, start, length, errorCode);
996        return;
997    }
998    setParseError(parseError, start /*, limit*/);  // Bad syntax for numeric value.
999    errorCode=U_PATTERN_SYNTAX_ERROR;
1000    return;
1001}
1002
1003int32_t
1004MessagePattern::skipWhiteSpace(int32_t index) {
1005    const UChar *s=msg.getBuffer();
1006    int32_t msgLength=msg.length();
1007    const UChar *t=PatternProps::skipWhiteSpace(s+index, msgLength-index);
1008    return (int32_t)(t-s);
1009}
1010
1011int32_t
1012MessagePattern::skipIdentifier(int32_t index) {
1013    const UChar *s=msg.getBuffer();
1014    int32_t msgLength=msg.length();
1015    const UChar *t=PatternProps::skipIdentifier(s+index, msgLength-index);
1016    return (int32_t)(t-s);
1017}
1018
1019int32_t
1020MessagePattern::skipDouble(int32_t index) {
1021    int32_t msgLength=msg.length();
1022    while(index<msgLength) {
1023        UChar c=msg.charAt(index);
1024        // U+221E: Allow the infinity symbol, for ChoiceFormat patterns.
1025        if((c<0x30 && c!=u_plus && c!=u_minus && c!=u_dot) || (c>0x39 && c!=u_e && c!=u_E && c!=0x221e)) {
1026            break;
1027        }
1028        ++index;
1029    }
1030    return index;
1031}
1032
1033UBool
1034MessagePattern::isArgTypeChar(UChar32 c) {
1035    return (u_a<=c && c<=u_z) || (u_A<=c && c<=u_Z);
1036}
1037
1038UBool
1039MessagePattern::isChoice(int32_t index) {
1040    UChar c;
1041    return
1042        ((c=msg.charAt(index++))==u_c || c==u_C) &&
1043        ((c=msg.charAt(index++))==u_h || c==u_H) &&
1044        ((c=msg.charAt(index++))==u_o || c==u_O) &&
1045        ((c=msg.charAt(index++))==u_i || c==u_I) &&
1046        ((c=msg.charAt(index++))==u_c || c==u_C) &&
1047        ((c=msg.charAt(index))==u_e || c==u_E);
1048}
1049
1050UBool
1051MessagePattern::isPlural(int32_t index) {
1052    UChar c;
1053    return
1054        ((c=msg.charAt(index++))==u_p || c==u_P) &&
1055        ((c=msg.charAt(index++))==u_l || c==u_L) &&
1056        ((c=msg.charAt(index++))==u_u || c==u_U) &&
1057        ((c=msg.charAt(index++))==u_r || c==u_R) &&
1058        ((c=msg.charAt(index++))==u_a || c==u_A) &&
1059        ((c=msg.charAt(index))==u_l || c==u_L);
1060}
1061
1062UBool
1063MessagePattern::isSelect(int32_t index) {
1064    UChar c;
1065    return
1066        ((c=msg.charAt(index++))==u_s || c==u_S) &&
1067        ((c=msg.charAt(index++))==u_e || c==u_E) &&
1068        ((c=msg.charAt(index++))==u_l || c==u_L) &&
1069        ((c=msg.charAt(index++))==u_e || c==u_E) &&
1070        ((c=msg.charAt(index++))==u_c || c==u_C) &&
1071        ((c=msg.charAt(index))==u_t || c==u_T);
1072}
1073
1074UBool
1075MessagePattern::isOrdinal(int32_t index) {
1076    UChar c;
1077    return
1078        ((c=msg.charAt(index++))==u_o || c==u_O) &&
1079        ((c=msg.charAt(index++))==u_r || c==u_R) &&
1080        ((c=msg.charAt(index++))==u_d || c==u_D) &&
1081        ((c=msg.charAt(index++))==u_i || c==u_I) &&
1082        ((c=msg.charAt(index++))==u_n || c==u_N) &&
1083        ((c=msg.charAt(index++))==u_a || c==u_A) &&
1084        ((c=msg.charAt(index))==u_l || c==u_L);
1085}
1086
1087UBool
1088MessagePattern::inMessageFormatPattern(int32_t nestingLevel) {
1089    return nestingLevel>0 || partsList->a[0].type==UMSGPAT_PART_TYPE_MSG_START;
1090}
1091
1092UBool
1093MessagePattern::inTopLevelChoiceMessage(int32_t nestingLevel, UMessagePatternArgType parentType) {
1094    return
1095        nestingLevel==1 &&
1096        parentType==UMSGPAT_ARG_TYPE_CHOICE &&
1097        partsList->a[0].type!=UMSGPAT_PART_TYPE_MSG_START;
1098}
1099
1100void
1101MessagePattern::addPart(UMessagePatternPartType type, int32_t index, int32_t length,
1102                        int32_t value, UErrorCode &errorCode) {
1103    if(partsList->ensureCapacityForOneMore(partsLength, errorCode)) {
1104        Part &part=partsList->a[partsLength++];
1105        part.type=type;
1106        part.index=index;
1107        part.length=(uint16_t)length;
1108        part.value=(int16_t)value;
1109        part.limitPartIndex=0;
1110    }
1111}
1112
1113void
1114MessagePattern::addLimitPart(int32_t start,
1115                             UMessagePatternPartType type, int32_t index, int32_t length,
1116                             int32_t value, UErrorCode &errorCode) {
1117    partsList->a[start].limitPartIndex=partsLength;
1118    addPart(type, index, length, value, errorCode);
1119}
1120
1121void
1122MessagePattern::addArgDoublePart(double numericValue, int32_t start, int32_t length,
1123                                 UErrorCode &errorCode) {
1124    if(U_FAILURE(errorCode)) {
1125        return;
1126    }
1127    int32_t numericIndex=numericValuesLength;
1128    if(numericValuesList==NULL) {
1129        numericValuesList=new MessagePatternDoubleList();
1130        if(numericValuesList==NULL) {
1131            errorCode=U_MEMORY_ALLOCATION_ERROR;
1132            return;
1133        }
1134    } else if(!numericValuesList->ensureCapacityForOneMore(numericValuesLength, errorCode)) {
1135        return;
1136    } else {
1137        if(numericIndex>Part::MAX_VALUE) {
1138            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1139            return;
1140        }
1141    }
1142    numericValuesList->a[numericValuesLength++]=numericValue;
1143    addPart(UMSGPAT_PART_TYPE_ARG_DOUBLE, start, length, numericIndex, errorCode);
1144}
1145
1146void
1147MessagePattern::setParseError(UParseError *parseError, int32_t index) {
1148    if(parseError==NULL) {
1149        return;
1150    }
1151    parseError->offset=index;
1152
1153    // Set preContext to some of msg before index.
1154    // Avoid splitting a surrogate pair.
1155    int32_t length=index;
1156    if(length>=U_PARSE_CONTEXT_LEN) {
1157        length=U_PARSE_CONTEXT_LEN-1;
1158        if(length>0 && U16_IS_TRAIL(msg[index-length])) {
1159            --length;
1160        }
1161    }
1162    msg.extract(index-length, length, parseError->preContext);
1163    parseError->preContext[length]=0;
1164
1165    // Set postContext to some of msg starting at index.
1166    length=msg.length()-index;
1167    if(length>=U_PARSE_CONTEXT_LEN) {
1168        length=U_PARSE_CONTEXT_LEN-1;
1169        if(length>0 && U16_IS_LEAD(msg[index+length-1])) {
1170            --length;
1171        }
1172    }
1173    msg.extract(index, length, parseError->postContext);
1174    parseError->postContext[length]=0;
1175}
1176
1177// MessageImpl ------------------------------------------------------------- ***
1178
1179void
1180MessageImpl::appendReducedApostrophes(const UnicodeString &s, int32_t start, int32_t limit,
1181                                      UnicodeString &sb) {
1182    int32_t doubleApos=-1;
1183    for(;;) {
1184        int32_t i=s.indexOf(u_apos, start);
1185        if(i<0 || i>=limit) {
1186            sb.append(s, start, limit-start);
1187            break;
1188        }
1189        if(i==doubleApos) {
1190            // Double apostrophe at start-1 and start==i, append one.
1191            sb.append(u_apos);
1192            ++start;
1193            doubleApos=-1;
1194        } else {
1195            // Append text between apostrophes and skip this one.
1196            sb.append(s, start, i-start);
1197            doubleApos=start=i+1;
1198        }
1199    }
1200}
1201
1202// Ported from second half of ICU4J SelectFormat.format(String).
1203UnicodeString &
1204MessageImpl::appendSubMessageWithoutSkipSyntax(const MessagePattern &msgPattern,
1205                                               int32_t msgStart,
1206                                               UnicodeString &result) {
1207    const UnicodeString &msgString=msgPattern.getPatternString();
1208    int32_t prevIndex=msgPattern.getPart(msgStart).getLimit();
1209    for(int32_t i=msgStart;;) {
1210        const MessagePattern::Part &part=msgPattern.getPart(++i);
1211        UMessagePatternPartType type=part.getType();
1212        int32_t index=part.getIndex();
1213        if(type==UMSGPAT_PART_TYPE_MSG_LIMIT) {
1214            return result.append(msgString, prevIndex, index-prevIndex);
1215        } else if(type==UMSGPAT_PART_TYPE_SKIP_SYNTAX) {
1216            result.append(msgString, prevIndex, index-prevIndex);
1217            prevIndex=part.getLimit();
1218        } else if(type==UMSGPAT_PART_TYPE_ARG_START) {
1219            result.append(msgString, prevIndex, index-prevIndex);
1220            prevIndex=index;
1221            i=msgPattern.getLimitPartIndex(i);
1222            index=msgPattern.getPart(i).getLimit();
1223            appendReducedApostrophes(msgString, prevIndex, index, result);
1224            prevIndex=index;
1225        }
1226    }
1227}
1228
1229U_NAMESPACE_END
1230
1231#endif  // !UCONFIG_NO_FORMATTING
1232