1/*
2**********************************************************************
3*   Copyright (c) 2001-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*/
7package com.ibm.icu.text;
8
9import java.text.ParsePosition;
10import java.util.ArrayList;
11import java.util.HashMap;
12import java.util.List;
13import java.util.Map;
14
15import com.ibm.icu.impl.IllegalIcuArgumentException;
16import com.ibm.icu.impl.PatternProps;
17import com.ibm.icu.impl.Utility;
18import com.ibm.icu.lang.UCharacter;
19import com.ibm.icu.text.RuleBasedTransliterator.Data;
20
21class TransliteratorParser {
22
23    //----------------------------------------------------------------------
24    // Data members
25    //----------------------------------------------------------------------
26
27    /**
28     * PUBLIC data member.
29     * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group
30     * of rules in the rule set
31     */
32    public List<Data> dataVector;
33
34    /**
35     * PUBLIC data member.
36     * A Vector of Strings containing all of the ID blocks in the rule set
37     */
38    public List<String> idBlockVector;
39
40    /**
41     * The current data object for which we are parsing rules
42     */
43    private Data curData;
44
45    /**
46     * PUBLIC data member containing the parsed compound filter, if any.
47     */
48    public UnicodeSet compoundFilter;
49
50
    /**
     * Direction indicator for the rules being parsed.
     * NOTE(review): presumably one of the Transliterator direction
     * constants (FORWARD / REVERSE); it is assigned outside this part of
     * the file -- confirm before relying on this.
     */
    private int direction;
52
53    /**
54     * Temporary symbol table used during parsing.
55     */
56    private ParseData parseData;
57
58    /**
59     * Temporary vector of set variables.  When parsing is complete, this
60     * is copied into the array data.variables.  As with data.variables,
61     * element 0 corresponds to character data.variablesBase.
62     */
63    private List<Object> variablesVector;
64
65    /**
66     * Temporary table of variable names.  When parsing is complete, this is
67     * copied into data.variableNames.
68     */
69    private Map<String, char[]> variableNames;
70
71    /**
72     * String of standins for segments.  Used during the parsing of a single
73     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
74     * to StringMatcher object segmentObjects.elementAt(0), etc.
75     */
76    private StringBuffer segmentStandins;
77
78    /**
79     * Vector of StringMatcher objects for segments.  Used during the
80     * parsing of a single rule.
81     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
82     * to StringMatcher object segmentObjects.elementAt(0), etc.
83     */
84    private List<StringMatcher> segmentObjects;
85
86    /**
87     * The next available stand-in for variables.  This starts at some point in
88     * the private use area (discovered dynamically) and increments up toward
89     * <code>variableLimit</code>.  At any point during parsing, available
90     * variables are <code>variableNext..variableLimit-1</code>.
91     */
92    private char variableNext;
93
94    /**
95     * The last available stand-in for variables.  This is discovered
96     * dynamically.  At any point during parsing, available variables are
97     * <code>variableNext..variableLimit-1</code>.  During variable definition
98     * we use the special value variableLimit-1 as a placeholder.
99     */
100    private char variableLimit;
101
102    /**
103     * When we encounter an undefined variable, we do not immediately signal
104     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
105     * Instead, we save the name of the undefined variable, and substitute
106     * in the placeholder char variableLimit - 1, and decrement
107     * variableLimit.
108     */
109    private String undefinedVariableName;
110
111    /**
112     * The stand-in character for the 'dot' set, represented by '.' in
113     * patterns.  This is allocated the first time it is needed, and
114     * reused thereafter.
115     */
116    private int dotStandIn = -1;
117
118    //----------------------------------------------------------------------
119    // Constants
120    //----------------------------------------------------------------------
121
122    // Indicator for ID blocks
123    private static final String ID_TOKEN = "::";
124    private static final int ID_TOKEN_LEN = 2;
125
126/*
127(reserved for future expansion)
128    // markers for beginning and end of rule groups
129    private static final String BEGIN_TOKEN = "BEGIN";
130    private static final String END_TOKEN = "END";
131*/
132
    // Operators
    // Single-character operators in their ASCII form.  The arrow aliases
    // for the rule operators are declared further down as ALT_*_RULE_OP.
    private static final char VARIABLE_DEF_OP   = '=';
    private static final char FORWARD_RULE_OP   = '>';
    private static final char REVERSE_RULE_OP   = '<';
    private static final char FWDREV_RULE_OP    = '~'; // internal rep of <> op

    // Every operator character: '=', '>', '<' followed by the arrow
    // aliases U+2190, U+2192 and U+2194.
    private static final String OPERATORS = "=><\u2190\u2192\u2194";
    // The OPERATORS characters plus END_OF_RULE (';').  RuleHalf.parseSection
    // stops accumulating a rule half when it meets any of these outside of
    // quotes, escapes and set patterns.
    private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
141
142    // Other special characters
143    private static final char QUOTE               = '\'';
144    private static final char ESCAPE              = '\\';
145    private static final char END_OF_RULE         = ';';
146    private static final char RULE_COMMENT_CHAR   = '#';
147
148    private static final char CONTEXT_ANTE        = '{'; // ante{key
149    private static final char CONTEXT_POST        = '}'; // key}post
150    private static final char CURSOR_POS          = '|';
151    private static final char CURSOR_OFFSET       = '@';
152    private static final char ANCHOR_START        = '^';
153
154    private static final char KLEENE_STAR         = '*';
155    private static final char ONE_OR_MORE         = '+';
156    private static final char ZERO_OR_ONE         = '?';
157
158    private static final char DOT                 = '.';
159    private static final String DOT_SET           = "[^[:Zp:][:Zl:]\\r\\n$]";
160
161    // By definition, the ANCHOR_END special character is a
162    // trailing SymbolTable.SYMBOL_REF character.
163    // private static final char ANCHOR_END       = '$';
164
165    // Segments of the input string are delimited by "(" and ")".  In the
166    // output string these segments are referenced as "$1", "$2", etc.
167    private static final char SEGMENT_OPEN        = '(';
168    private static final char SEGMENT_CLOSE       = ')';
169
170    // A function is denoted &Source-Target/Variant(text)
171    private static final char FUNCTION            = '&';
172
173    // Aliases for some of the syntax characters. These are provided so
174    // transliteration rules can be expressed in XML without clashing with
175    // XML syntax characters '<', '>', and '&'.
176    private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
177    private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
178    private static final char ALT_FWDREV_RULE_OP  = '\u2194'; // Left Right Arrow
179    private static final char ALT_FUNCTION        = '\u2206'; // Increment (~Greek Capital Delta)
180
181    // Special characters disallowed at the top level
182    private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
183
184    // Special characters disallowed within a segment
185    private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");
186
187    // Special characters disallowed within a function argument
188    private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
189
190    //----------------------------------------------------------------------
191    // class ParseData
192    //----------------------------------------------------------------------
193
194    /**
195     * This class implements the SymbolTable interface.  It is used
196     * during parsing to give UnicodeSet access to variables that
197     * have been defined so far.  Note that it uses variablesVector,
198     * _not_ data.variables.
199     */
200    private class ParseData implements SymbolTable {
201
202        /**
203         * Implement SymbolTable API.
204         */
205        public char[] lookup(String name) {
206            return variableNames.get(name);
207        }
208
209        /**
210         * Implement SymbolTable API.
211         */
212        public UnicodeMatcher lookupMatcher(int ch) {
213            // Note that we cannot use data.lookup() because the
214            // set array has not been constructed yet.
215            int i = ch - curData.variablesBase;
216            if (i >= 0 && i < variablesVector.size()) {
217                return (UnicodeMatcher) variablesVector.get(i);
218            }
219            return null;
220        }
221
222        /**
223         * Implement SymbolTable API.  Parse out a symbol reference
224         * name.
225         */
226        public String parseReference(String text, ParsePosition pos, int limit) {
227            int start = pos.getIndex();
228            int i = start;
229            while (i < limit) {
230                char c = text.charAt(i);
231                if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) ||
232                    !UCharacter.isUnicodeIdentifierPart(c)) {
233                    break;
234                }
235                ++i;
236            }
237            if (i == start) { // No valid name chars
238                return null;
239            }
240            pos.setIndex(i);
241            return text.substring(start, i);
242        }
243
244        /**
245         * Return true if the given character is a matcher standin or a plain
246         * character (non standin).
247         */
248        public boolean isMatcher(int ch) {
249            // Note that we cannot use data.lookup() because the
250            // set array has not been constructed yet.
251            int i = ch - curData.variablesBase;
252            if (i >= 0 && i < variablesVector.size()) {
253                return variablesVector.get(i) instanceof UnicodeMatcher;
254            }
255            return true;
256        }
257
258        /**
259         * Return true if the given character is a replacer standin or a plain
260         * character (non standin).
261         */
262        public boolean isReplacer(int ch) {
263            // Note that we cannot use data.lookup() because the
264            // set array has not been constructed yet.
265            int i = ch - curData.variablesBase;
266            if (i >= 0 && i < variablesVector.size()) {
267                return variablesVector.get(i) instanceof UnicodeReplacer;
268            }
269            return true;
270        }
271    }
272
273    //----------------------------------------------------------------------
274    // classes RuleBody, RuleArray, and RuleReader
275    //----------------------------------------------------------------------
276
277    /**
278     * A private abstract class representing the interface to rule
279     * source code that is broken up into lines.  Handles the
280     * folding of lines terminated by a backslash.  This folding
281     * is limited; it does not account for comments, quotes, or
282     * escapes, so its use to be limited.
283     */
284    private static abstract class RuleBody {
285
286        /**
287         * Retrieve the next line of the source, or return null if
288         * none.  Folds lines terminated by a backslash into the
289         * next line, without regard for comments, quotes, or
290         * escapes.
291         */
292        String nextLine() {
293            String s = handleNextLine();
294            if (s != null &&
295                s.length() > 0 &&
296                s.charAt(s.length() - 1) == '\\') {
297                StringBuilder b = new StringBuilder(s);
298                do {
299                    b.deleteCharAt(b.length()-1);
300                    s = handleNextLine();
301                    if (s == null) {
302                        break;
303                    }
304                    b.append(s);
305                } while (s.length() > 0 &&
306                         s.charAt(s.length() - 1) == '\\');
307                s = b.toString();
308            }
309            return s;
310        }
311
312        /**
313         * Reset to the first line of the source.
314         */
315        abstract void reset();
316
317        /**
318         * Subclass method to return the next line of the source.
319         */
320        abstract String handleNextLine();
321    }
322
323    /**
324     * RuleBody subclass for a String[] array.
325     */
326    private static class RuleArray extends RuleBody {
327        String[] array;
328        int i;
329        public RuleArray(String[] array) { this.array = array; i = 0; }
330        public String handleNextLine() {
331            return (i < array.length) ? array[i++] : null;
332        }
333        public void reset() {
334            i = 0;
335        }
336    }
337
338    /*
339     * RuleBody subclass for a ResourceReader.
340     */
341/*    private static class RuleReader extends RuleBody {
342        ResourceReader reader;
343        public RuleReader(ResourceReader reader) { this.reader = reader; }
344        public String handleNextLine() {
345            try {
346                return reader.readLine();
347            } catch (java.io.IOException e) {}
348            return null;
349        }
350        public void reset() {
351            reader.reset();
352        }
353    }*/
354
355    //----------------------------------------------------------------------
356    // class RuleHalf
357    //----------------------------------------------------------------------
358
359    /**
360     * A class representing one side of a rule.  This class knows how to
361     * parse half of a rule.  It is tightly coupled to the method
362     * TransliteratorParser.parseRule().
363     */
364    private static class RuleHalf {
365
        // Pattern accumulated by parse() for this half of the rule: literal
        // characters from the rule plus stand-in characters for matcher and
        // replacer objects.
        public String text;
367
368        public int cursor = -1; // position of cursor in text
369        public int ante = -1;   // position of ante context marker '{' in text
370        public int post = -1;   // position of post context marker '}' in text
371
372        // Record the offset to the cursor either to the left or to the
373        // right of the key.  This is indicated by characters on the output
374        // side that allow the cursor to be positioned arbitrarily within
375        // the matching text.  For example, abc{def} > | @@@ xyz; changes
376        // def to xyz and moves the cursor to before abc.  Offset characters
377        // must be at the start or end, and they cannot move the cursor past
378        // the ante- or postcontext text.  Placeholders are only valid in
379        // output text.  The length of the ante and post context is
380        // determined at runtime, because of supplementals and quantifiers.
381        public int cursorOffset = 0; // only nonzero on output side
382
383        // Position of first CURSOR_OFFSET on _right_.  This will be -1
384        // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
385        private int cursorOffsetPos = 0;
386
387        public boolean anchorStart = false;
388        public boolean anchorEnd   = false;
389
390        /**
391         * The segment number from 1..n of the next '(' we see
392         * during parsing; 1-based.
393         */
394        private int nextSegmentNumber = 1;
395
396        /**
397         * Parse one side of a rule, stopping at either the limit,
398         * the END_OF_RULE character, or an operator.
399         * @return the index after the terminating character, or
400         * if limit was reached, limit
401         */
402        public int parse(String rule, int pos, int limit,
403                         TransliteratorParser parser) {
404            int start = pos;
405            StringBuffer buf = new StringBuffer();
406            pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
407            text = buf.toString();
408
409            if (cursorOffset > 0 && cursor != cursorOffsetPos) {
410                syntaxError("Misplaced " + CURSOR_POS, rule, start);
411            }
412
413            return pos;
414        }
415
416        /**
417         * Parse a section of one side of a rule, stopping at either
418         * the limit, the END_OF_RULE character, an operator, or a
419         * segment close character.  This method parses both a
420         * top-level rule half and a segment within such a rule half.
421         * It calls itself recursively to parse segments and nested
422         * segments.
423         * @param buf buffer into which to accumulate the rule pattern
424         * characters, either literal characters from the rule or
425         * standins for UnicodeMatcher objects including segments.
426         * @param illegal the set of special characters that is illegal during
427         * this parse.
428         * @param isSegment if true, then we've already seen a '(' and
429         * pos on entry points right after it.  Accumulate everything
430         * up to the closing ')', put it in a segment matcher object,
431         * generate a standin for it, and add the standin to buf.  As
432         * a side effect, update the segments vector with a reference
433         * to the segment matcher.  This works recursively for nested
434         * segments.  If isSegment is false, just accumulate
435         * characters into buf.
436         * @return the index after the terminating character, or
437         * if limit was reached, limit
438         */
439        private int parseSection(String rule, int pos, int limit,
440                                 TransliteratorParser parser,
441                                 StringBuffer buf,
442                                 UnicodeSet illegal,
443                                 boolean isSegment) {
444            int start = pos;
445            ParsePosition pp = null;
446            int quoteStart = -1; // Most recent 'single quoted string'
447            int quoteLimit = -1;
448            int varStart = -1; // Most recent $variableReference
449            int varLimit = -1;
450            int[] iref = new int[1];
451            int bufStart = buf.length();
452
453        main:
454            while (pos < limit) {
455                // Since all syntax characters are in the BMP, fetching
456                // 16-bit code units suffices here.
457                char c = rule.charAt(pos++);
458                if (PatternProps.isWhiteSpace(c)) {
459                    continue;
460                }
461                // HALF_ENDERS is all chars that end a rule half: "<>=;"
462                if (HALF_ENDERS.indexOf(c) >= 0) {
463                    ///CLOVER:OFF
464                    // isSegment is always false
465                    if (isSegment) {
466                        syntaxError("Unclosed segment", rule, start);
467                    }
468                    ///CLOVER:ON
469                    break main;
470                }
471                if (anchorEnd) {
472                    // Text after a presumed end anchor is a syntax err
473                    syntaxError("Malformed variable reference", rule, start);
474                }
475                if (UnicodeSet.resemblesPattern(rule, pos-1)) {
476                    if (pp == null) {
477                        pp = new ParsePosition(0);
478                    }
479                    pp.setIndex(pos-1); // Backup to opening '['
480                    buf.append(parser.parseSet(rule, pp));
481                    pos = pp.getIndex();
482                    continue;
483                }
484                // Handle escapes
485                if (c == ESCAPE) {
486                    if (pos == limit) {
487                        syntaxError("Trailing backslash", rule, start);
488                    }
489                    iref[0] = pos;
490                    int escaped = Utility.unescapeAt(rule, iref);
491                    pos = iref[0];
492                    if (escaped == -1) {
493                        syntaxError("Malformed escape", rule, start);
494                    }
495                    parser.checkVariableRange(escaped, rule, start);
496                    UTF16.append(buf, escaped);
497                    continue;
498                }
499                // Handle quoted matter
500                if (c == QUOTE) {
501                    int iq = rule.indexOf(QUOTE, pos);
502                    if (iq == pos) {
503                        buf.append(c); // Parse [''] outside quotes as [']
504                        ++pos;
505                    } else {
506                        /* This loop picks up a run of quoted text of the
507                         * form 'aaaa' each time through.  If this run
508                         * hasn't really ended ('aaaa''bbbb') then it keeps
509                         * looping, each time adding on a new run.  When it
510                         * reaches the final quote it breaks.
511                         */
512                        quoteStart = buf.length();
513                        for (;;) {
514                            if (iq < 0) {
515                                syntaxError("Unterminated quote", rule, start);
516                            }
517                            buf.append(rule.substring(pos, iq));
518                            pos = iq+1;
519                            if (pos < limit && rule.charAt(pos) == QUOTE) {
520                            // Parse [''] inside quotes as [']
521                                iq = rule.indexOf(QUOTE, pos+1);
522                            // Continue looping
523                            } else {
524                                break;
525                            }
526                        }
527                        quoteLimit = buf.length();
528
529                        for (iq=quoteStart; iq<quoteLimit; ++iq) {
530                            parser.checkVariableRange(buf.charAt(iq), rule, start);
531                        }
532                    }
533                    continue;
534                }
535
536                parser.checkVariableRange(c, rule, start);
537
538                if (illegal.contains(c)) {
539                    syntaxError("Illegal character '" + c + '\'', rule, start);
540                }
541
542                switch (c) {
543
544                //------------------------------------------------------
545                // Elements allowed within and out of segments
546                //------------------------------------------------------
547                case ANCHOR_START:
548                    if (buf.length() == 0 && !anchorStart) {
549                        anchorStart = true;
550                    } else {
551                        syntaxError("Misplaced anchor start",
552                                    rule, start);
553                    }
554                    break;
555                case SEGMENT_OPEN:
556                    {
557                        // bufSegStart is the offset in buf to the first
558                        // character of the segment we are parsing.
559                        int bufSegStart = buf.length();
560
561                        // Record segment number now, since nextSegmentNumber
562                        // will be incremented during the call to parseSection
563                        // if there are nested segments.
564                        int segmentNumber = nextSegmentNumber++; // 1-based
565
566                        // Parse the segment
567                        pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);
568
569                        // After parsing a segment, the relevant characters are
570                        // in buf, starting at offset bufSegStart.  Extract them
571                        // into a string matcher, and replace them with a
572                        // standin for that matcher.
573                        StringMatcher m =
574                            new StringMatcher(buf.substring(bufSegStart),
575                                              segmentNumber, parser.curData);
576
577                        // Record and associate object and segment number
578                        parser.setSegmentObject(segmentNumber, m);
579                        buf.setLength(bufSegStart);
580                        buf.append(parser.getSegmentStandin(segmentNumber));
581                    }
582                    break;
583                case FUNCTION:
584                case ALT_FUNCTION:
585                    {
586                        iref[0] = pos;
587                        TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
588                        // The next character MUST be a segment open
589                        if (single == null ||
590                            !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
591                            syntaxError("Invalid function", rule, start);
592                        }
593
594                        Transliterator t = single.getInstance();
595                        if (t == null) {
596                            syntaxError("Invalid function ID", rule, start);
597                        }
598
599                        // bufSegStart is the offset in buf to the first
600                        // character of the segment we are parsing.
601                        int bufSegStart = buf.length();
602
603                        // Parse the segment
604                        pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);
605
606                        // After parsing a segment, the relevant characters are
607                        // in buf, starting at offset bufSegStart.
608                        FunctionReplacer r =
609                            new FunctionReplacer(t,
610                                new StringReplacer(buf.substring(bufSegStart), parser.curData));
611
612                        // Replace the buffer contents with a stand-in
613                        buf.setLength(bufSegStart);
614                        buf.append(parser.generateStandInFor(r));
615                    }
616                    break;
617                case SymbolTable.SYMBOL_REF:
618                    // Handle variable references and segment references "$1" .. "$9"
619                    {
620                        // A variable reference must be followed immediately
621                        // by a Unicode identifier start and zero or more
622                        // Unicode identifier part characters, or by a digit
623                        // 1..9 if it is a segment reference.
624                        if (pos == limit) {
625                            // A variable ref character at the end acts as
626                            // an anchor to the context limit, as in perl.
627                            anchorEnd = true;
628                            break;
629                        }
630                        // Parse "$1" "$2" .. "$9" .. (no upper limit)
631                        c = rule.charAt(pos);
632                        int r = UCharacter.digit(c, 10);
633                        if (r >= 1 && r <= 9) {
634                            iref[0] = pos;
635                            r = Utility.parseNumber(rule, iref, 10);
636                            if (r < 0) {
637                                syntaxError("Undefined segment reference",
638                                            rule, start);
639                            }
640                            pos = iref[0];
641                            buf.append(parser.getSegmentStandin(r));
642                        } else {
643                            if (pp == null) { // Lazy create
644                                pp = new ParsePosition(0);
645                            }
646                            pp.setIndex(pos);
647                            String name = parser.parseData.
648                                parseReference(rule, pp, limit);
649                            if (name == null) {
650                                // This means the '$' was not followed by a
651                                // valid name.  Try to interpret it as an
652                                // end anchor then.  If this also doesn't work
653                                // (if we see a following character) then signal
654                                // an error.
655                                anchorEnd = true;
656                                break;
657                            }
658                            pos = pp.getIndex();
659                            // If this is a variable definition statement,
660                            // then the LHS variable will be undefined.  In
661                            // that case appendVariableDef() will append the
662                            // special placeholder char variableLimit-1.
663                            varStart = buf.length();
664                            parser.appendVariableDef(name, buf);
665                            varLimit = buf.length();
666                        }
667                    }
668                    break;
669                case DOT:
670                    buf.append(parser.getDotStandIn());
671                    break;
672                case KLEENE_STAR:
673                case ONE_OR_MORE:
674                case ZERO_OR_ONE:
675                    // Quantifiers.  We handle single characters, quoted strings,
676                    // variable references, and segments.
677                    //  a+      matches  aaa
678                    //  'foo'+  matches  foofoofoo
679                    //  $v+     matches  xyxyxy if $v == xy
680                    //  (seg)+  matches  segsegseg
681                    {
682                        ///CLOVER:OFF
683                        // isSegment is always false
684                        if (isSegment && buf.length() == bufStart) {
685                            // The */+ immediately follows '('
686                            syntaxError("Misplaced quantifier", rule, start);
687                            break;
688                        }
689                        ///CLOVER:ON
690
691                        int qstart, qlimit;
692                        // The */+ follows an isolated character or quote
693                        // or variable reference
694                        if (buf.length() == quoteLimit) {
695                            // The */+ follows a 'quoted string'
696                            qstart = quoteStart;
697                            qlimit = quoteLimit;
698                        } else if (buf.length() == varLimit) {
699                            // The */+ follows a $variableReference
700                            qstart = varStart;
701                            qlimit = varLimit;
702                        } else {
703                            // The */+ follows a single character, possibly
704                            // a segment standin
705                            qstart = buf.length() - 1;
706                            qlimit = qstart + 1;
707                        }
708
709                        UnicodeMatcher m;
710                        try {
711                            m = new StringMatcher(buf.toString(), qstart, qlimit,
712                                              0, parser.curData);
713                        } catch (RuntimeException e) {
714                            final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
715                            final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
716                            throw (RuntimeException)
717                                new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
718                                        + postContext).initCause(e);
719                        }
720                        int min = 0;
721                        int max = Quantifier.MAX;
722                        switch (c) {
723                        case ONE_OR_MORE:
724                            min = 1;
725                            break;
726                        case ZERO_OR_ONE:
727                            min = 0;
728                            max = 1;
729                            break;
730                            // case KLEENE_STAR:
731                            //    do nothing -- min, max already set
732                        }
733                        m = new Quantifier(m, min, max);
734                        buf.setLength(qstart);
735                        buf.append(parser.generateStandInFor(m));
736                    }
737                    break;
738
739                //------------------------------------------------------
740                // Elements allowed ONLY WITHIN segments
741                //------------------------------------------------------
742                case SEGMENT_CLOSE:
743                    // assert(isSegment);
744                    // We're done parsing a segment.
745                    break main;
746
747                //------------------------------------------------------
748                // Elements allowed ONLY OUTSIDE segments
749                //------------------------------------------------------
750                case CONTEXT_ANTE:
751                    if (ante >= 0) {
752                        syntaxError("Multiple ante contexts", rule, start);
753                    }
754                    ante = buf.length();
755                    break;
756                case CONTEXT_POST:
757                    if (post >= 0) {
758                        syntaxError("Multiple post contexts", rule, start);
759                    }
760                    post = buf.length();
761                    break;
762                case CURSOR_POS:
763                    if (cursor >= 0) {
764                        syntaxError("Multiple cursors", rule, start);
765                    }
766                    cursor = buf.length();
767                    break;
768                case CURSOR_OFFSET:
769                    if (cursorOffset < 0) {
770                        if (buf.length() > 0) {
771                            syntaxError("Misplaced " + c, rule, start);
772                        }
773                        --cursorOffset;
774                    } else if (cursorOffset > 0) {
775                        if (buf.length() != cursorOffsetPos || cursor >= 0) {
776                            syntaxError("Misplaced " + c, rule, start);
777                        }
778                        ++cursorOffset;
779                    } else {
780                        if (cursor == 0 && buf.length() == 0) {
781                            cursorOffset = -1;
782                        } else if (cursor < 0) {
783                            cursorOffsetPos = buf.length();
784                            cursorOffset = 1;
785                        } else {
786                            syntaxError("Misplaced " + c, rule, start);
787                        }
788                    }
789                    break;
790
791                //------------------------------------------------------
792                // Non-special characters
793                //------------------------------------------------------
794                default:
795                    // Disallow unquoted characters other than [0-9A-Za-z]
796                    // in the printable ASCII range.  These characters are
797                    // reserved for possible future use.
798                    if (c >= 0x0021 && c <= 0x007E &&
799                        !((c >= '0' && c <= '9') ||
800                          (c >= 'A' && c <= 'Z') ||
801                          (c >= 'a' && c <= 'z'))) {
802                        syntaxError("Unquoted " + c, rule, start);
803                    }
804                    buf.append(c);
805                    break;
806                }
807            }
808            return pos;
809        }
810
811        /**
812         * Remove context.
813         */
814        void removeContext() {
815            text = text.substring(ante < 0 ? 0 : ante,
816                                  post < 0 ? text.length() : post);
817            ante = post = -1;
818            anchorStart = anchorEnd = false;
819        }
820
821        /**
822         * Return true if this half looks like valid output, that is, does not
823         * contain quantifiers or other special input-only elements.
824         */
825        public boolean isValidOutput(TransliteratorParser parser) {
826            for (int i=0; i<text.length(); ) {
827                int c = UTF16.charAt(text, i);
828                i += UTF16.getCharCount(c);
829                if (!parser.parseData.isReplacer(c)) {
830                    return false;
831                }
832            }
833            return true;
834        }
835
836        /**
837         * Return true if this half looks like valid input, that is, does not
838         * contain functions or other special output-only elements.
839         */
840        public boolean isValidInput(TransliteratorParser parser) {
841            for (int i=0; i<text.length(); ) {
842                int c = UTF16.charAt(text, i);
843                i += UTF16.getCharCount(c);
844                if (!parser.parseData.isMatcher(c)) {
845                    return false;
846                }
847            }
848            return true;
849        }
850    }
851
852    //----------------------------------------------------------------------
853    // PUBLIC methods
854    //----------------------------------------------------------------------
855
    /**
     * Constructor.  Performs no work of its own; the parser's state is
     * set up by parse(), which also publishes the results in the public
     * data members.
     */
    public TransliteratorParser() {
    }
861
862    /**
863     * Parse a set of rules.  After the parse completes, examine the public
864     * data members for results.
865     */
866    public void parse(String rules, int dir) {
867        parseRules(new RuleArray(new String[] { rules }), dir);
868    }
869
870    /*
871     * Parse a set of rules.  After the parse completes, examine the public
872     * data members for results.
873     */
874/*    public void parse(ResourceReader rules, int direction) {
875        parseRules(new RuleReader(rules), direction);
876    }*/
877
878    //----------------------------------------------------------------------
879    // PRIVATE methods
880    //----------------------------------------------------------------------
881
882    /**
883     * Parse an array of zero or more rules.  The strings in the array are
884     * treated as if they were concatenated together, with rule terminators
885     * inserted between array elements if not present already.
886     *
887     * Any previous rules are discarded.  Typically this method is called exactly
888     * once, during construction.
889     *
890     * The member this.data will be set to null if there are no rules.
891     *
892     * @exception IllegalIcuArgumentException if there is a syntax error in the
893     * rules
894     */
895    void parseRules(RuleBody ruleArray, int dir) {
896        boolean parsingIDs = true;
897        int ruleCount = 0;
898
899        dataVector = new ArrayList<Data>();
900        idBlockVector = new ArrayList<String>();
901        curData = null;
902        direction = dir;
903        compoundFilter = null;
904        variablesVector = new ArrayList<Object>();
905        variableNames = new HashMap<String, char[]>();
906        parseData = new ParseData();
907
908        List<RuntimeException> errors = new ArrayList<RuntimeException>();
909        int errorCount = 0;
910
911        ruleArray.reset();
912
913        StringBuilder idBlockResult = new StringBuilder();
914
915        // The compound filter offset is an index into idBlockResult.
916        // If it is 0, then the compound filter occurred at the start,
917        // and it is the offset to the _start_ of the compound filter
918        // pattern.  Otherwise it is the offset to the _limit_ of the
919        // compound filter pattern within idBlockResult.
920        this.compoundFilter = null;
921        int compoundFilterOffset = -1;
922
923    main:
924        for (;;) {
925            String rule = ruleArray.nextLine();
926            if (rule == null) {
927                break;
928            }
929            int pos = 0;
930            int limit = rule.length();
931            while (pos < limit) {
932                char c = rule.charAt(pos++);
933                if (PatternProps.isWhiteSpace(c)) {
934                    continue;
935                }
936                // Skip lines starting with the comment character
937                if (c == RULE_COMMENT_CHAR) {
938                    pos = rule.indexOf("\n", pos) + 1;
939                    if (pos == 0) {
940                        break; // No "\n" found; rest of rule is a commnet
941                    }
942                    continue; // Either fall out or restart with next line
943                }
944
945                // skip empty rules
946                if (c == END_OF_RULE)
947                    continue;
948
949                // Often a rule file contains multiple errors.  It's
950                // convenient to the rule author if these are all reported
951                // at once.  We keep parsing rules even after a failure, up
952                // to a specified limit, and report all errors at once.
953                try {
954                    ++ruleCount;
955
956                    // We've found the start of a rule or ID.  c is its first
957                    // character, and pos points past c.
958                    --pos;
959                    // Look for an ID token.  Must have at least ID_TOKEN_LEN + 1
960                    // chars left.
961                    if ((pos + ID_TOKEN_LEN + 1) <= limit &&
962                            rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
963                        pos += ID_TOKEN_LEN;
964                        c = rule.charAt(pos);
965                        while (PatternProps.isWhiteSpace(c) && pos < limit) {
966                            ++pos;
967                            c = rule.charAt(pos);
968                        }
969                        int[] p = new int[] { pos };
970
971                        if (!parsingIDs) {
972                            if (curData != null) {
973                                if (direction == Transliterator.FORWARD)
974                                    dataVector.add(curData);
975                                else
976                                    dataVector.add(0, curData);
977                                curData = null;
978                            }
979                            parsingIDs = true;
980                        }
981
982                        TransliteratorIDParser.SingleID id =
983                            TransliteratorIDParser.parseSingleID(
984                                          rule, p, direction);
985                        if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
986                            // Successful ::ID parse.
987
988                            if (direction == Transliterator.FORWARD) {
989                                idBlockResult.append(id.canonID).append(END_OF_RULE);
990                            } else {
991                                idBlockResult.insert(0, id.canonID + END_OF_RULE);
992                            }
993
994                        } else {
995                            // Couldn't parse an ID.  Try to parse a global filter
996                            int[] withParens = new int[] { -1 };
997                            UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);
998                            if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
999                                if ((direction == Transliterator.FORWARD) ==
1000                                    (withParens[0] == 0)) {
1001                                    if (compoundFilter != null) {
1002                                        // Multiple compound filters
1003                                        syntaxError("Multiple global filters", rule, pos);
1004                                    }
1005                                    compoundFilter = f;
1006                                    compoundFilterOffset = ruleCount;
1007                               }
1008                            } else {
1009                                // Invalid ::id
1010                                // Can be parsed as neither an ID nor a global filter
1011                                syntaxError("Invalid ::ID", rule, pos);
1012                            }
1013                        }
1014
1015                        pos = p[0];
1016                    } else {
1017                        if (parsingIDs) {
1018                            if (direction == Transliterator.FORWARD)
1019                                idBlockVector.add(idBlockResult.toString());
1020                            else
1021                                idBlockVector.add(0, idBlockResult.toString());
1022                            idBlockResult.delete(0, idBlockResult.length());
1023                            parsingIDs = false;
1024                            curData = new RuleBasedTransliterator.Data();
1025
1026                            // By default, rules use part of the private use area
1027                            // E000..F8FF for variables and other stand-ins.  Currently
1028                            // the range F000..F8FF is typically sufficient.  The 'use
1029                            // variable range' pragma allows rule sets to modify this.
1030                            setVariableRange(0xF000, 0xF8FF);
1031                        }
1032
1033                        if (resemblesPragma(rule, pos, limit)) {
1034                            int ppp = parsePragma(rule, pos, limit);
1035                            if (ppp < 0) {
1036                                syntaxError("Unrecognized pragma", rule, pos);
1037                            }
1038                            pos = ppp;
1039                        // Parse a rule
1040                        } else {
1041                            pos = parseRule(rule, pos, limit);
1042                        }
1043                    }
1044                } catch (IllegalArgumentException e) {
1045                    if (errorCount == 30) {
1046                        IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched");
1047                        icuEx.initCause(e);
1048                        errors.add(icuEx);
1049                        break main;
1050                    }
1051                    e.fillInStackTrace();
1052                    errors.add(e);
1053                    ++errorCount;
1054                    pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
1055                }
1056            }
1057        }
1058        if (parsingIDs && idBlockResult.length() > 0) {
1059            if (direction == Transliterator.FORWARD)
1060                idBlockVector.add(idBlockResult.toString());
1061            else
1062                idBlockVector.add(0, idBlockResult.toString());
1063        }
1064        else if (!parsingIDs && curData != null) {
1065            if (direction == Transliterator.FORWARD)
1066                dataVector.add(curData);
1067            else
1068                dataVector.add(0, curData);
1069        }
1070
1071        // Convert the set vector to an array
1072        for (int i = 0; i < dataVector.size(); i++) {
1073            Data data = dataVector.get(i);
1074            data.variables = new Object[variablesVector.size()];
1075            variablesVector.toArray(data.variables);
1076            data.variableNames = new HashMap<String, char[]>();
1077            data.variableNames.putAll(variableNames);
1078        }
1079        variablesVector = null;
1080
1081        // Do more syntax checking and index the rules
1082        try {
1083            if (compoundFilter != null) {
1084                if ((direction == Transliterator.FORWARD &&
1085                     compoundFilterOffset != 1) ||
1086                    (direction == Transliterator.REVERSE &&
1087                     compoundFilterOffset != ruleCount)) {
1088                    throw new IllegalIcuArgumentException("Compound filters misplaced");
1089                }
1090            }
1091
1092            for (int i = 0; i < dataVector.size(); i++) {
1093                Data data = dataVector.get(i);
1094                data.ruleSet.freeze();
1095            }
1096
1097            if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0)
1098                idBlockVector.remove(0);
1099
1100        } catch (IllegalArgumentException e) {
1101            e.fillInStackTrace();
1102            errors.add(e);
1103        }
1104
1105        if (errors.size() != 0) {
1106            for (int i = errors.size()-1; i > 0; --i) {
1107                RuntimeException previous = errors.get(i-1);
1108                while (previous.getCause() != null) {
1109                    previous = (RuntimeException) previous.getCause(); // chain specially
1110                }
1111                previous.initCause(errors.get(i));
1112            }
1113            throw errors.get(0);
1114            // if initCause not supported: throw new IllegalArgumentException(errors.toString());
1115        }
1116    }
1117
1118    /**
1119     * MAIN PARSER.  Parse the next rule in the given rule string, starting
1120     * at pos.  Return the index after the last character parsed.  Do not
1121     * parse characters at or after limit.
1122     *
1123     * Important:  The character at pos must be a non-whitespace character
1124     * that is not the comment character.
1125     *
1126     * This method handles quoting, escaping, and whitespace removal.  It
1127     * parses the end-of-rule character.  It recognizes context and cursor
1128     * indicators.  Once it does a lexical breakdown of the rule at pos, it
1129     * creates a rule object and adds it to our rule list.
1130     *
1131     * This method is tightly coupled to the inner class RuleHalf.
1132     */
1133    private int parseRule(String rule, int pos, int limit) {
1134        // Locate the left side, operator, and right side
1135        int start = pos;
1136        char operator = 0;
1137
1138        // Set up segments data
1139        segmentStandins = new StringBuffer();
1140        segmentObjects = new ArrayList<StringMatcher>();
1141
1142        RuleHalf left  = new RuleHalf();
1143        RuleHalf right = new RuleHalf();
1144
1145        undefinedVariableName = null;
1146        pos = left.parse(rule, pos, limit, this);
1147
1148        if (pos == limit ||
1149            OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
1150            syntaxError("No operator pos=" + pos, rule, start);
1151        }
1152        ++pos;
1153
1154        // Found an operator char.  Check for forward-reverse operator.
1155        if (operator == REVERSE_RULE_OP &&
1156            (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
1157            ++pos;
1158            operator = FWDREV_RULE_OP;
1159        }
1160
1161        // Translate alternate op characters.
1162        switch (operator) {
1163        case ALT_FORWARD_RULE_OP:
1164            operator = FORWARD_RULE_OP;
1165            break;
1166        case ALT_REVERSE_RULE_OP:
1167            operator = REVERSE_RULE_OP;
1168            break;
1169        case ALT_FWDREV_RULE_OP:
1170            operator = FWDREV_RULE_OP;
1171            break;
1172        }
1173
1174        pos = right.parse(rule, pos, limit, this);
1175
1176        if (pos < limit) {
1177            if (rule.charAt(--pos) == END_OF_RULE) {
1178                ++pos;
1179            } else {
1180                // RuleHalf parser must have terminated at an operator
1181                syntaxError("Unquoted operator", rule, start);
1182            }
1183        }
1184
1185        if (operator == VARIABLE_DEF_OP) {
1186            // LHS is the name.  RHS is a single character, either a literal
1187            // or a set (already parsed).  If RHS is longer than one
1188            // character, it is either a multi-character string, or multiple
1189            // sets, or a mixture of chars and sets -- syntax error.
1190
1191            // We expect to see a single undefined variable (the one being
1192            // defined).
1193            if (undefinedVariableName == null) {
1194                syntaxError("Missing '$' or duplicate definition", rule, start);
1195            }
1196            if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
1197                syntaxError("Malformed LHS", rule, start);
1198            }
1199            if (left.anchorStart || left.anchorEnd ||
1200                right.anchorStart || right.anchorEnd) {
1201                syntaxError("Malformed variable def", rule, start);
1202            }
1203            // We allow anything on the right, including an empty string.
1204            int n = right.text.length();
1205            char[] value = new char[n];
1206            right.text.getChars(0, n, value, 0);
1207            variableNames.put(undefinedVariableName, value);
1208
1209            ++variableLimit;
1210            return pos;
1211        }
1212
1213        // If this is not a variable definition rule, we shouldn't have
1214        // any undefined variable names.
1215        if (undefinedVariableName != null) {
1216            syntaxError("Undefined variable $" + undefinedVariableName,
1217                        rule, start);
1218        }
1219
1220        // Verify segments
1221        if (segmentStandins.length() > segmentObjects.size()) {
1222            syntaxError("Undefined segment reference", rule, start);
1223        }
1224        for (int i=0; i<segmentStandins.length(); ++i) {
1225            if (segmentStandins.charAt(i) == 0) {
1226                syntaxError("Internal error", rule, start); // will never happen
1227            }
1228        }
1229        for (int i=0; i<segmentObjects.size(); ++i) {
1230            if (segmentObjects.get(i) == null) {
1231                syntaxError("Internal error", rule, start); // will never happen
1232            }
1233        }
1234
1235        // If the direction we want doesn't match the rule
1236        // direction, do nothing.
1237        if (operator != FWDREV_RULE_OP &&
1238            ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
1239            return pos;
1240        }
1241
1242        // Transform the rule into a forward rule by swapping the
1243        // sides if necessary.
1244        if (direction == Transliterator.REVERSE) {
1245            RuleHalf temp = left;
1246            left = right;
1247            right = temp;
1248        }
1249
1250        // Remove non-applicable elements in forward-reverse
1251        // rules.  Bidirectional rules ignore elements that do not
1252        // apply.
1253        if (operator == FWDREV_RULE_OP) {
1254            right.removeContext();
1255            left.cursor = -1;
1256            left.cursorOffset = 0;
1257        }
1258
1259        // Normalize context
1260        if (left.ante < 0) {
1261            left.ante = 0;
1262        }
1263        if (left.post < 0) {
1264            left.post = left.text.length();
1265        }
1266
1267        // Context is only allowed on the input side.  Cursors are only
1268        // allowed on the output side.  Segment delimiters can only appear
1269        // on the left, and references on the right.  Cursor offset
1270        // cannot appear without an explicit cursor.  Cursor offset
1271        // cannot place the cursor outside the limits of the context.
1272        // Anchors are only allowed on the input side.
1273        if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
1274            (right.cursorOffset != 0 && right.cursor < 0) ||
1275            // - The following two checks were used to ensure that the
1276            // - the cursor offset stayed within the ante- or postcontext.
1277            // - However, with the addition of quantifiers, we have to
1278            // - allow arbitrary cursor offsets and do runtime checking.
1279            //(right.cursorOffset > (left.text.length() - left.post)) ||
1280            //(-right.cursorOffset > left.ante) ||
1281            right.anchorStart || right.anchorEnd ||
1282            !left.isValidInput(this) || !right.isValidOutput(this) ||
1283            left.ante > left.post) {
1284            syntaxError("Malformed rule", rule, start);
1285        }
1286
1287        // Flatten segment objects vector to an array
1288        UnicodeMatcher[] segmentsArray = null;
1289        if (segmentObjects.size() > 0) {
1290            segmentsArray = new UnicodeMatcher[segmentObjects.size()];
1291            segmentObjects.toArray(segmentsArray);
1292        }
1293
1294        curData.ruleSet.addRule(new TransliterationRule(
1295                                     left.text, left.ante, left.post,
1296                                     right.text, right.cursor, right.cursorOffset,
1297                                     segmentsArray,
1298                                     left.anchorStart, left.anchorEnd,
1299                                     curData));
1300
1301        return pos;
1302    }
1303
1304    /**
1305     * Set the variable range to [start, end] (inclusive).
1306     */
1307    private void setVariableRange(int start, int end) {
1308        if (start > end || start < 0 || end > 0xFFFF) {
1309            throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);
1310        }
1311
1312        curData.variablesBase = (char) start; // first private use
1313
1314        if (dataVector.size() == 0) {
1315            variableNext = (char) start;
1316            variableLimit = (char) (end + 1);
1317        }
1318    }
1319
1320    /**
1321     * Assert that the given character is NOT within the variable range.
1322     * If it is, signal an error.  This is neccesary to ensure that the
1323     * variable range does not overlap characters used in a rule.
1324     */
1325    private void checkVariableRange(int ch, String rule, int start) {
1326        if (ch >= curData.variablesBase && ch < variableLimit) {
1327            syntaxError("Variable range character in rule", rule, start);
1328        }
1329    }
1330
1331    // (The following method is part of an unimplemented feature.
1332    // Remove this clover pragma after the feature is implemented.
1333    // 2003-06-11 ICU 2.6 Alan)
1334    ///CLOVER:OFF
1335    /**
1336     * Set the maximum backup to 'backup', in response to a pragma
1337     * statement.
1338     */
1339    private void pragmaMaximumBackup(int backup) {
1340        //TODO Finish
1341        throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet");
1342    }
1343    ///CLOVER:ON
1344
1345    // (The following method is part of an unimplemented feature.
1346    // Remove this clover pragma after the feature is implemented.
1347    // 2003-06-11 ICU 2.6 Alan)
1348    ///CLOVER:OFF
1349    /**
1350     * Begin normalizing all rules using the given mode, in response
1351     * to a pragma statement.
1352     */
1353    private void pragmaNormalizeRules(Normalizer.Mode mode) {
1354        //TODO Finish
1355        throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet");
1356    }
1357    ///CLOVER:ON
1358
1359    /**
1360     * Return true if the given rule looks like a pragma.
1361     * @param pos offset to the first non-whitespace character
1362     * of the rule.
1363     * @param limit pointer past the last character of the rule.
1364     */
1365    static boolean resemblesPragma(String rule, int pos, int limit) {
1366        // Must start with /use\s/i
1367        return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
1368    }
1369
1370    /**
1371     * Parse a pragma.  This method assumes resemblesPragma() has
1372     * already returned true.
1373     * @param pos offset to the first non-whitespace character
1374     * of the rule.
1375     * @param limit pointer past the last character of the rule.
1376     * @return the position index after the final ';' of the pragma,
1377     * or -1 on failure.
1378     */
1379    private int parsePragma(String rule, int pos, int limit) {
1380        int[] array = new int[2];
1381
1382        // resemblesPragma() has already returned true, so we
1383        // know that pos points to /use\s/i; we can skip 4 characters
1384        // immediately
1385        pos += 4;
1386
1387        // Here are the pragmas we recognize:
1388        // use variable range 0xE000 0xEFFF;
1389        // use maximum backup 16;
1390        // use nfd rules;
1391        int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);
1392        if (p >= 0) {
1393            setVariableRange(array[0], array[1]);
1394            return p;
1395        }
1396
1397        p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);
1398        if (p >= 0) {
1399            pragmaMaximumBackup(array[0]);
1400            return p;
1401        }
1402
1403        p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
1404        if (p >= 0) {
1405            pragmaNormalizeRules(Normalizer.NFD);
1406            return p;
1407        }
1408
1409        p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
1410        if (p >= 0) {
1411            pragmaNormalizeRules(Normalizer.NFC);
1412            return p;
1413        }
1414
1415        // Syntax error: unable to parse pragma
1416        return -1;
1417    }
1418
1419    /**
1420     * Throw an exception indicating a syntax error.  Search the rule string
1421     * for the probable end of the rule.  Of course, if the error is that
1422     * the end of rule marker is missing, then the rule end will not be found.
1423     * In any case the rule start will be correctly reported.
1424     * @param msg error description
1425     * @param rule pattern string
1426     * @param start position of first character of current rule
1427     */
1428    static final void syntaxError(String msg, String rule, int start) {
1429        int end = ruleEnd(rule, start, rule.length());
1430        throw new IllegalIcuArgumentException(msg + " in \"" +
1431                                           Utility.escape(rule.substring(start, end)) + '"');
1432    }
1433
1434    static final int ruleEnd(String rule, int start, int limit) {
1435        int end = Utility.quotedIndexOf(rule, start, limit, ";");
1436        if (end < 0) {
1437            end = limit;
1438        }
1439        return end;
1440    }
1441
1442    /**
1443     * Read a UnicodeSet from the rule at pos, compact it, and hand it to
1444     * generateStandInFor(); the stand-in character obtained is returned.
1445     */
1446    private final char parseSet(String rule, ParsePosition pos) {
1447        final UnicodeSet parsed = new UnicodeSet(rule, pos, parseData);
1448        if (!(variableNext < variableLimit)) {
1449            throw new RuntimeException("Private use variables exhausted");
1450        }
1451        parsed.compact();
1452        return generateStandInFor(parsed);
1453    }
1454
1455    /**
1456     * Return the stand-in for a UnicodeMatcher or UnicodeReplacer: the one
1457     * it already has if it was stored before, else a newly allocated one.
1458     */
1459    char generateStandInFor(Object obj) {
1460        // assert(obj != null);
1461        // Identity (==) search on purpose, not equals().  The list is
1462        // short (typical n is 0, 1, or 2), so a linear scan is fine.
1463        int slot = 0;
1464        for (Object known : variablesVector) {
1465            if (known == obj) {
1466                return (char) (curData.variablesBase + slot);
1467            }
1468            ++slot;
1469        }
1470        if (variableNext >= variableLimit) {
1471            throw new RuntimeException("Variable range exhausted");
1472        }
1473        variablesVector.add(obj);
1474        return variableNext++;
1475    }
1476
1477    /**
1478     * Return the stand-in for segment seg (1-based), allocating one the
1479     * first time that segment is requested.
1480     */
1481    public char getSegmentStandin(int seg) {
1482        if (seg > segmentStandins.length()) {
1483            segmentStandins.setLength(seg);
1484        }
1485        char existing = segmentStandins.charAt(seg - 1);
1486        if (existing != 0) {
1487            return existing;
1488        }
1489        if (variableNext >= variableLimit) {
1490            throw new RuntimeException("Variable range exhausted");
1491        }
1492        char fresh = variableNext++;
1493        // Null placeholder; setSegmentObject() fills it in after calling us.
1494        variablesVector.add(null);
1495        segmentStandins.setCharAt(seg - 1, fresh);
1496        return fresh;
1497    }
1498
1499    /**
1500     * Record obj as the matcher for segment seg (1-based), both in
1501     * segmentObjects and in the variables vector slot of its stand-in.
1502     */
1503    public void setSegmentObject(int seg, StringMatcher obj) {
1504        // parseSection() recurses, so a nested segment i+1 is parsed and
1505        // stored before segment i; pad the list up to seg first.
1506        for (int n = segmentObjects.size(); n < seg; ++n) {
1507            segmentObjects.add(null);
1508        }
1509        final int index = getSegmentStandin(seg) - curData.variablesBase;
1510        boolean unset = segmentObjects.get(seg - 1) == null
1511                && variablesVector.get(index) == null;
1512        if (!unset) {
1513            throw new RuntimeException(); // should never happen
1514        }
1515        segmentObjects.set(seg - 1, obj);
1516        variablesVector.set(index, obj);
1517    }
1518
1519    /**
1520     * Return the dot set's stand-in; allocated on first call, then reused.
1521     */
1522    char getDotStandIn() {
1523        if (dotStandIn != -1) {
1524            return (char) dotStandIn;
1525        }
1526        dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
1527        return (char) dotStandIn;
1528    }
1529
1530    /**
1531     * Append the value of the given variable name to the given
1532     * StringBuffer.
1533     * @exception IllegalIcuArgumentException if the name is unknown.
1534     */
1535    private void appendVariableDef(String name, StringBuffer buf) {
1536        char[] ch = variableNames.get(name);
1537        if (ch == null) {
1538            // We allow one undefined variable so that variable definition
1539            // statements work.  For the first undefined variable we return
1540            // the special placeholder variableLimit-1, and save the variable
1541            // name.
1542            if (undefinedVariableName == null) {
1543                undefinedVariableName = name;
1544                if (variableNext >= variableLimit) {
1545                    throw new RuntimeException("Private use variables exhausted");
1546                }
1547                buf.append(--variableLimit);
1548            } else {
1549                throw new IllegalIcuArgumentException("Undefined variable $"
1550                                                   + name);
1551            }
1552        } else {
1553            buf.append(ch);
1554        }
1555    }
1556}
1557
1558//eof
1559