1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10package android.icu.text;
11
12import java.text.ParsePosition;
13import java.util.HashMap;
14
15import android.icu.impl.Assert;
16import android.icu.impl.Utility;
17import android.icu.lang.UCharacter;
18
19/**
20  *  This class is part of the Rule Based Break Iterator rule compiler.
21  *  It scans the rules and builds the parse tree.
22  *  There is no public API here.
23  */
24class RBBIRuleScanner {
25
26    private final static int    kStackSize = 100;               // The size of the state stack for
27    //   rules parsing.  Corresponds roughly
28    //   to the depth of parentheses nesting
29    //   that is allowed in the rules.
30
31    static class RBBIRuleChar {
32        int             fChar;
33        boolean         fEscaped;
34    }
35
36
37
38    RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
39
40    int                       fScanIndex;        // Index of current character being processed
41                                                     //   in the rule input string.
42    int                       fNextIndex;        // Index of the next character, which
43                                                     //   is the first character not yet scanned.
44    boolean                  fQuoteMode;        // Scan is in a 'quoted region'
45    int                       fLineNum;          // Line number in input file.
46    int                       fCharNum;          // Char position within the line.
47    int                       fLastChar;         // Previous char, needed to count CR-LF
48                                                     //   as a single line, not two.
49
50    RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
51                                                     //   processing.
52
53
54    short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
55    int                       fStackPtr;           //  and pops as specified in the state
56                                                       //  transition rules.
57
58    RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
59                                                           //  during the parse of a rule
60    int                        fNodeStackPtr;
61
62
63    boolean                    fReverseRule;         // True if the rule currently being scanned
64                                                     //  is a reverse direction rule (if it
65                                                     //  starts with a '!')
66
67    boolean                    fLookAheadRule;       // True if the rule includes a '/'
68                                                     //   somewhere within it.
69
70    boolean                    fNoChainInRule;       // True if the current rule starts with a '^'.
71
72
73    RBBISymbolTable            fSymbolTable;         // symbol table, holds definitions of
74                                                     //   $variable symbols.
75
76    HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
77                                                                                       //   the sets created while parsing rules.
78                                                                                       //   The key is the string used for creating
79                                                                                       //   the set.
80
81    UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
82                                                     //  the scanning of RBBI rules.  The
83                                                     //  indicies for these are assigned by the
84                                                     //  perl script that builds the state tables.
85                                                     //  See rbbirpt.h.
86
87    int                        fRuleNum;         // Counts each rule as it is scanned.
88
89    int                        fOptionStart;     // Input index of start of a !!option
90                                                 //   keyword, while being scanned.
91
92
93
94   static private String gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
95   static private String gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
96   static private String gRuleSet_digit_char_pattern      = "[0-9]";
97   static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
98   static private String gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
99   static private String kAny =  "any";
100
101
102
103
104    //----------------------------------------------------------------------------------------
105    //
106    //  Constructor.
107    //
108    //----------------------------------------------------------------------------------------
109    RBBIRuleScanner(RBBIRuleBuilder rb) {
110        fRB = rb;
111        fLineNum = 1;
112
113        //
114        //  Set up the constant Unicode Sets.
115        //     Note: These could be made static and shared among
116        //            all instances of RBBIRuleScanners.
117        fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
118        fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
119        fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
120        fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
121        fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
122
123        fSymbolTable = new RBBISymbolTable(this);
124    }
125
126    //----------------------------------------------------------------------------------------
127    //
128    //  doParseAction Do some action during rule parsing.
129    //                       Called by the parse state machine.
130    //                       Actions build the parse tree and Unicode Sets,
131    //                       and maintain the parse stack for nested expressions.
132    //
133    //----------------------------------------------------------------------------------------
134    boolean doParseActions(int action) {
135        RBBINode n = null;
136
137        boolean returnVal = true;
138
139        switch (action) {
140
141        case RBBIRuleParseTable.doExprStart:
142            pushNewNode(RBBINode.opStart);
143            fRuleNum++;
144            break;
145
146        case RBBIRuleParseTable.doNoChain:
147            // Scanned a '^' while on the rule start state.
148            fNoChainInRule = true;
149            break;
150
151
152        case RBBIRuleParseTable.doExprOrOperator: {
153            fixOpStack(RBBINode.precOpCat);
154            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
155            RBBINode orNode = pushNewNode(RBBINode.opOr);
156            orNode.fLeftChild = operandNode;
157            operandNode.fParent = orNode;
158        }
159            break;
160
161        case RBBIRuleParseTable.doExprCatOperator:
162        // concatenation operator.
163        // For the implicit concatenation of adjacent terms in an expression
164        // that are
165        //   not separated by any other operator. Action is invoked between the
166        //   actions for the two terms.
167        {
168            fixOpStack(RBBINode.precOpCat);
169            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
170            RBBINode catNode = pushNewNode(RBBINode.opCat);
171            catNode.fLeftChild = operandNode;
172            operandNode.fParent = catNode;
173        }
174            break;
175
176        case RBBIRuleParseTable.doLParen:
177            // Open Paren.
178            //   The openParen node is a dummy operation type with a low
179            // precedence,
180            //     which has the affect of ensuring that any real binary op that
181            //     follows within the parens binds more tightly to the operands than
182            //     stuff outside of the parens.
183            pushNewNode(RBBINode.opLParen);
184            break;
185
186        case RBBIRuleParseTable.doExprRParen:
187            fixOpStack(RBBINode.precLParen);
188            break;
189
190        case RBBIRuleParseTable.doNOP:
191            break;
192
193        case RBBIRuleParseTable.doStartAssign:
194            // We've just scanned "$variable = "
195            // The top of the node stack has the $variable ref node.
196
197            // Save the start position of the RHS text in the StartExpression
198            // node
199            //   that precedes the $variableReference node on the stack.
200            //   This will eventually be used when saving the full $variable
201            // replacement
202            //   text as a string.
203            n = fNodeStack[fNodeStackPtr - 1];
204            n.fFirstPos = fNextIndex; // move past the '='
205
206            // Push a new start-of-expression node; needed to keep parse of the
207            //   RHS expression happy.
208            pushNewNode(RBBINode.opStart);
209            break;
210
211        case RBBIRuleParseTable.doEndAssign: {
212            // We have reached the end of an assignement statement.
213            //   Current scan char is the ';' that terminates the assignment.
214
215            // Terminate expression, leaves expression parse tree rooted in TOS
216            // node.
217            fixOpStack(RBBINode.precStart);
218
219            RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
220            RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
221            RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
222
223            // Save original text of right side of assignment, excluding the
224            // terminating ';'
225            //  in the root of the node for the right-hand-side expression.
226            RHSExprNode.fFirstPos = startExprNode.fFirstPos;
227            RHSExprNode.fLastPos = fScanIndex;
228            // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
229            // RHSExprNode.fLastPos, RHSExprNode.fText);
230            RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
231                    RHSExprNode.fLastPos);
232
233            // Expression parse tree becomes l. child of the $variable reference
234            // node.
235            varRefNode.fLeftChild = RHSExprNode;
236            RHSExprNode.fParent = varRefNode;
237
238            // Make a symbol table entry for the $variableRef node.
239            fSymbolTable.addEntry(varRefNode.fText, varRefNode);
240
241            // Clean up the stack.
242            fNodeStackPtr -= 3;
243            break;
244        }
245
246        case RBBIRuleParseTable.doEndOfRule: {
247            fixOpStack(RBBINode.precStart); // Terminate expression, leaves
248                                            // expression
249
250            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
251                printNodeStack("end of rule");
252            }
253            Assert.assrt(fNodeStackPtr == 1);
254            RBBINode thisRule = fNodeStack[fNodeStackPtr];
255
256            // If this rule includes a look-ahead '/', add a endMark node to the
257            //   expression tree.
258            if (fLookAheadRule) {
259                RBBINode endNode = pushNewNode(RBBINode.endMark);
260                RBBINode catNode = pushNewNode(RBBINode.opCat);
261                fNodeStackPtr -= 2;
262                catNode.fLeftChild = thisRule;
263                catNode.fRightChild = endNode;
264                fNodeStack[fNodeStackPtr] = catNode;
265                endNode.fVal = fRuleNum;
266                endNode.fLookAheadEnd = true;
267                thisRule = catNode;
268
269                // TODO: Disable chaining out of look-ahead (hard break) rules.
270                //   The break on rule match is forced, so there is no point in building up
271                //   the state table to chain into another rule for a longer match.
272            }
273
274            // Mark this node as being the root of a rule.
275            thisRule.fRuleRoot = true;
276
277            // Flag if chaining into this rule is wanted.
278            //
279            if (fRB.fChainRules &&          // If rule chaining is enabled globally via !!chain
280                    !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
281                thisRule.fChainIn = true;
282            }
283
284
285            // All rule expressions are ORed together.
286            // The ';' that terminates an expression really just functions as a
287            // '|' with
288            //   a low operator prededence.
289            //
290            // Each of the four sets of rules are collected separately.
291            //  (forward, reverse, safe_forward, safe_reverse)
292            //  OR this rule into the appropriate group of them.
293            //
294
295            int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
296
297            if (fRB.fTreeRoots[destRules] != null) {
298                // This is not the first rule encountered.
299                // OR previous stuff (from *destRules)
300                // with the current rule expression (on the Node Stack)
301                //  with the resulting OR expression going to *destRules
302                //
303                thisRule = fNodeStack[fNodeStackPtr];
304                RBBINode prevRules = fRB.fTreeRoots[destRules];
305                RBBINode orNode = pushNewNode(RBBINode.opOr);
306                orNode.fLeftChild = prevRules;
307                prevRules.fParent = orNode;
308                orNode.fRightChild = thisRule;
309                thisRule.fParent = orNode;
310                fRB.fTreeRoots[destRules] = orNode;
311            } else {
312                // This is the first rule encountered (for this direction).
313                // Just move its parse tree from the stack to *destRules.
314                fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
315            }
316            fReverseRule = false; // in preparation for the next rule.
317            fLookAheadRule = false;
318            fNoChainInRule = false;
319            fNodeStackPtr = 0;
320        }
321            break;
322
323        case RBBIRuleParseTable.doRuleError:
324            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
325            returnVal = false;
326            break;
327
328        case RBBIRuleParseTable.doVariableNameExpectedErr:
329            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
330            break;
331
332        //
333        //  Unary operands + ? *
334        //    These all appear after the operand to which they apply.
335        //    When we hit one, the operand (may be a whole sub expression)
336        //    will be on the top of the stack.
337        //    Unary Operator becomes TOS, with the old TOS as its one child.
338        case RBBIRuleParseTable.doUnaryOpPlus: {
339            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
340            RBBINode plusNode = pushNewNode(RBBINode.opPlus);
341            plusNode.fLeftChild = operandNode;
342            operandNode.fParent = plusNode;
343        }
344            break;
345
346        case RBBIRuleParseTable.doUnaryOpQuestion: {
347            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
348            RBBINode qNode = pushNewNode(RBBINode.opQuestion);
349            qNode.fLeftChild = operandNode;
350            operandNode.fParent = qNode;
351        }
352            break;
353
354        case RBBIRuleParseTable.doUnaryOpStar: {
355            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
356            RBBINode starNode = pushNewNode(RBBINode.opStar);
357            starNode.fLeftChild = operandNode;
358            operandNode.fParent = starNode;
359        }
360            break;
361
362        case RBBIRuleParseTable.doRuleChar:
363        // A "Rule Character" is any single character that is a literal part
364        // of the regular expression. Like a, b and c in the expression "(abc*)
365        // | [:L:]"
366        // These are pretty uncommon in break rules; the terms are more commonly
367        //  sets. To keep things uniform, treat these characters like as
368        // sets that just happen to contain only one character.
369        {
370            n = pushNewNode(RBBINode.setRef);
371            String s = String.valueOf((char)fC.fChar);
372            findSetFor(s, n, null);
373            n.fFirstPos = fScanIndex;
374            n.fLastPos = fNextIndex;
375            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
376            break;
377        }
378
379        case RBBIRuleParseTable.doDotAny:
380        // scanned a ".", meaning match any single character.
381        {
382            n = pushNewNode(RBBINode.setRef);
383            findSetFor(kAny, n, null);
384            n.fFirstPos = fScanIndex;
385            n.fLastPos = fNextIndex;
386            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
387            break;
388        }
389
390        case RBBIRuleParseTable.doSlash:
391            // Scanned a '/', which identifies a look-ahead break position in a
392            // rule.
393            n = pushNewNode(RBBINode.lookAhead);
394            n.fVal = fRuleNum;
395            n.fFirstPos = fScanIndex;
396            n.fLastPos = fNextIndex;
397            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
398            fLookAheadRule = true;
399            break;
400
401        case RBBIRuleParseTable.doStartTagValue:
402            // Scanned a '{', the opening delimiter for a tag value within a
403            // rule.
404            n = pushNewNode(RBBINode.tag);
405            n.fVal = 0;
406            n.fFirstPos = fScanIndex;
407            n.fLastPos = fNextIndex;
408            break;
409
410        case RBBIRuleParseTable.doTagDigit:
411        // Just scanned a decimal digit that's part of a tag value
412        {
413            n = fNodeStack[fNodeStackPtr];
414            int v = UCharacter.digit((char) fC.fChar, 10);
415            n.fVal = n.fVal * 10 + v;
416            break;
417        }
418
419        case RBBIRuleParseTable.doTagValue:
420            n = fNodeStack[fNodeStackPtr];
421            n.fLastPos = fNextIndex;
422            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
423            break;
424
425        case RBBIRuleParseTable.doTagExpectedError:
426            error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
427            returnVal = false;
428            break;
429
430        case RBBIRuleParseTable.doOptionStart:
431            // Scanning a !!option. At the start of string.
432            fOptionStart = fScanIndex;
433            break;
434
435        case RBBIRuleParseTable.doOptionEnd: {
436            String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
437            if (opt.equals("chain")) {
438                fRB.fChainRules = true;
439            } else if (opt.equals("LBCMNoChain")) {
440                fRB.fLBCMNoChain = true;
441            } else if (opt.equals("forward")) {
442                fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
443            } else if (opt.equals("reverse")) {
444                fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
445            } else if (opt.equals("safe_forward")) {
446                fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
447            } else if (opt.equals("safe_reverse")) {
448                fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
449            } else if (opt.equals("lookAheadHardBreak")) {
450                fRB.fLookAheadHardBreak = true;
451            } else {
452                error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
453            }
454            break;
455        }
456
457        case RBBIRuleParseTable.doReverseDir:
458            fReverseRule = true;
459            break;
460
461        case RBBIRuleParseTable.doStartVariableName:
462            n = pushNewNode(RBBINode.varRef);
463            n.fFirstPos = fScanIndex;
464            break;
465
466        case RBBIRuleParseTable.doEndVariableName:
467            n = fNodeStack[fNodeStackPtr];
468            if (n == null || n.fType != RBBINode.varRef) {
469                error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
470                break;
471            }
472            n.fLastPos = fScanIndex;
473            n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
474            // Look the newly scanned name up in the symbol table
475            //   If there's an entry, set the l. child of the var ref to the
476            // replacement expression.
477            //   (We also pass through here when scanning assignments, but no harm
478            // is done, other
479            //    than a slight wasted effort that seems hard to avoid. Lookup will
480            // be null)
481            n.fLeftChild = fSymbolTable.lookupNode(n.fText);
482            break;
483
484        case RBBIRuleParseTable.doCheckVarDef:
485            n = fNodeStack[fNodeStackPtr];
486            if (n.fLeftChild == null) {
487                error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
488                returnVal = false;
489            }
490            break;
491
492        case RBBIRuleParseTable.doExprFinished:
493            break;
494
495        case RBBIRuleParseTable.doRuleErrorAssignExpr:
496            error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
497            returnVal = false;
498            break;
499
500        case RBBIRuleParseTable.doExit:
501            returnVal = false;
502            break;
503
504        case RBBIRuleParseTable.doScanUnicodeSet:
505            scanSet();
506            break;
507
508        default:
509            error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
510            returnVal = false;
511            break;
512        }
513        return returnVal;
514    }
515
516    //----------------------------------------------------------------------------------------
517    //
518    //  Error Throw and IllegalArgumentException in response to a rule parse
519    // error.
520    //
521    //----------------------------------------------------------------------------------------
522    void error(int e) {
523        String s = "Error " + e + " at line " + fLineNum + " column "
524                + fCharNum;
525        IllegalArgumentException ex = new IllegalArgumentException(s);
526        throw ex;
527
528    }
529
530    //----------------------------------------------------------------------------------------
531    //
532    //  fixOpStack The parse stack holds partially assembled chunks of the parse
533    // tree.
534    //               An entry on the stack may be as small as a single setRef node,
535    //               or as large as the parse tree
536    //               for an entire expression (this will be the one item left on the stack
537    //               when the parsing of an RBBI rule completes.
538    //
539    //               This function is called when a binary operator is encountered.
540    //               It looks back up the stack for operators that are not yet associated
541    //               with a right operand, and if the precedence of the stacked operator >=
542    //               the precedence of the current operator, binds the operand left,
543    //               to the previously encountered operator.
544    //
545    //----------------------------------------------------------------------------------------
546    void fixOpStack(int p) {
547        RBBINode n;
548        // printNodeStack("entering fixOpStack()");
549        for (;;) {
550            n = fNodeStack[fNodeStackPtr - 1]; // an operator node
551            if (n.fPrecedence == 0) {
552                System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
553                error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
554                return;
555            }
556
557            if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
558                // The most recent operand goes with the current operator,
559                //   not with the previously stacked one.
560                break;
561            }
562            // Stack operator is a binary op ( '|' or concatenation)
563            //   TOS operand becomes right child of this operator.
564            //   Resulting subexpression becomes the TOS operand.
565            n.fRightChild = fNodeStack[fNodeStackPtr];
566            fNodeStack[fNodeStackPtr].fParent = n;
567            fNodeStackPtr--;
568            // printNodeStack("looping in fixOpStack() ");
569        }
570
571        if (p <= RBBINode.precLParen) {
572            // Scan is at a right paren or end of expression.
573            //  The scanned item must match the stack, or else there was an
574            // error.
575            //  Discard the left paren (or start expr) node from the stack,
576            //  leaving the completed (sub)expression as TOS.
577            if (n.fPrecedence != p) {
578                // Right paren encountered matched start of expression node, or
579                // end of expression matched with a left paren node.
580                error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
581            }
582            fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
583            fNodeStackPtr--;
584            // Delete the now-discarded LParen or Start node.
585            // delete n;
586        }
587        // printNodeStack("leaving fixOpStack()");
588    }
589
590    //----------------------------------------------------------------------------
591    //
592    //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
593    //                        been encountered. The val Node will be of nodetype uset
594    //                        and contain pointers to the actual UnicodeSets.
595    //                        The Key is the source string for initializing the set.
596    //
597    //                        The hash table is used to avoid creating duplicate
598    //                        unnamed (not $var references) UnicodeSets.
599    //
600    //----------------------------------------------------------------------------
601    static class RBBISetTableEl {
602        String key;
603
604        RBBINode val;
605    }
606
607
608    //----------------------------------------------------------------------------------------
609    //
610    //   findSetFor given a String,
611    //                  - find the corresponding Unicode Set (uset node)
612    //                         (create one if necessary)
613    //                  - Set fLeftChild of the caller's node (should be a setRef node)
614    //                         to the uset node
615    //                 Maintain a hash table of uset nodes, so the same one is always used
616    //                    for the same string.
617    //                 If a "to adopt" set is provided and we haven't seen this key before,
618    //                    add the provided set to the hash table.
619    //                 If the string is one (32 bit) char in length, the set contains
620    //                    just one element which is the char in question.
621    //                 If the string is "any", return a set containing all chars.
622    //
623    //----------------------------------------------------------------------------------------
624    void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
625
626        RBBISetTableEl el;
627
628        // First check whether we've already cached a set for this string.
629        // If so, just use the cached set in the new node.
630        //   delete any set provided by the caller, since we own it.
631        el = fSetTable.get(s);
632        if (el != null) {
633            node.fLeftChild = el.val;
634            Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
635            return;
636        }
637
638        // Haven't seen this set before.
639        // If the caller didn't provide us with a prebuilt set,
640        //   create a new UnicodeSet now.
641        if (setToAdopt == null) {
642            if (s.equals(kAny)) {
643                setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
644            } else {
645                int c;
646                c = UTF16.charAt(s, 0);
647                setToAdopt = new UnicodeSet(c, c);
648            }
649        }
650
651        //
652        // Make a new uset node to refer to this UnicodeSet
653        // This new uset node becomes the child of the caller's setReference
654        // node.
655        //
656        RBBINode usetNode = new RBBINode(RBBINode.uset);
657        usetNode.fInputSet = setToAdopt;
658        usetNode.fParent = node;
659        node.fLeftChild = usetNode;
660        usetNode.fText = s;
661
662        //
663        // Add the new uset node to the list of all uset nodes.
664        //
665        fRB.fUSetNodes.add(usetNode);
666
667        //
668        // Add the new set to the set hash table.
669        //
670        el = new RBBISetTableEl();
671        el.key = s;
672        el.val = usetNode;
673        fSetTable.put(el.key, el);
674
675        return;
676    }
677
678    //
679    //  Assorted Unicode character constants.
680    //     Numeric because there is no portable way to enter them as literals.
681    //     (Think EBCDIC).
682    //
683    static final int chNEL = 0x85; //    NEL newline variant
684
685    static final int chLS = 0x2028; //    Unicode Line Separator
686
687    //----------------------------------------------------------------------------------------
688    //
689    //  stripRules    Return a rules string without unnecessary
690    //                characters.
691    //
692    //----------------------------------------------------------------------------------------
693    static String stripRules(String rules) {
694        StringBuilder strippedRules = new StringBuilder();
695        int rulesLength = rules.length();
696        for (int idx = 0; idx < rulesLength;) {
697            char ch = rules.charAt(idx++);
698            if (ch == '#') {
699                while (idx < rulesLength
700                        && ch != '\r' && ch != '\n' && ch != chNEL) {
701                    ch = rules.charAt(idx++);
702                }
703            }
704            if (!UCharacter.isISOControl(ch)) {
705                strippedRules.append(ch);
706            }
707        }
708        return strippedRules.toString();
709    }
710
711    //----------------------------------------------------------------------------------------
712    //
713    //  nextCharLL    Low Level Next Char from rule input source.
714    //                Get a char from the input character iterator,
715    //                keep track of input position for error reporting.
716    //
717    //----------------------------------------------------------------------------------------
718    int nextCharLL() {
719        int ch;
720
721        if (fNextIndex >= fRB.fRules.length()) {
722            return -1;
723        }
724        ch = UTF16.charAt(fRB.fRules, fNextIndex);
725        fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
726
727        if (ch == '\r' ||
728            ch == chNEL ||
729            ch == chLS ||
730            ch == '\n' && fLastChar != '\r') {
731            // Character is starting a new line.  Bump up the line number, and
732            //  reset the column to 0.
733            fLineNum++;
734            fCharNum = 0;
735            if (fQuoteMode) {
736                error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
737                fQuoteMode = false;
738            }
739        } else {
740            // Character is not starting a new line.  Except in the case of a
741            //   LF following a CR, increment the column position.
742            if (ch != '\n') {
743                fCharNum++;
744            }
745        }
746        fLastChar = ch;
747        return ch;
748    }
749
750    //---------------------------------------------------------------------------------
751    //
752    //   nextChar     for rules scanning.  At this level, we handle stripping
753    //                out comments and processing backslash character escapes.
754    //                The rest of the rules grammar is handled at the next level up.
755    //
756    //---------------------------------------------------------------------------------
757    void nextChar(RBBIRuleChar c) {
758
759        // Unicode Character constants needed for the processing done by nextChar(),
760        //   in hex because literals wont work on EBCDIC machines.
761
762        fScanIndex = fNextIndex;
763        c.fChar = nextCharLL();
764        c.fEscaped = false;
765
766        //
767        //  check for '' sequence.
768        //  These are recognized in all contexts, whether in quoted text or not.
769        //
770        if (c.fChar == '\'') {
771            if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
772                c.fChar = nextCharLL(); // get nextChar officially so character counts
773                c.fEscaped = true; //   stay correct.
774            } else {
775                // Single quote, by itself.
776                //   Toggle quoting mode.
777                //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
778                fQuoteMode = !fQuoteMode;
779                if (fQuoteMode == true) {
780                    c.fChar = '(';
781                } else {
782                    c.fChar = ')';
783                }
784                c.fEscaped = false; // The paren that we return is not escaped.
785                return;
786            }
787        }
788
789        if (fQuoteMode) {
790            c.fEscaped = true;
791        } else {
792            // We are not in a 'quoted region' of the source.
793            //
794            if (c.fChar == '#') {
795                // Start of a comment.  Consume the rest of it.
796                //  The new-line char that terminates the comment is always returned.
797                //  It will be treated as white-space, and serves to break up anything
798                //    that might otherwise incorrectly clump together with a comment in
799                //    the middle (a variable name, for example.)
800                for (;;) {
801                    c.fChar = nextCharLL();
802                    if (c.fChar == -1 || // EOF
803                        c.fChar == '\r' ||
804                        c.fChar == '\n' ||
805                        c.fChar == chNEL ||
806                        c.fChar == chLS)
807                    {
808                        break;
809                    }
810                }
811            }
812            if (c.fChar == -1) {
813                return;
814            }
815
816            //
817            //  check for backslash escaped characters.
818            //  Use String.unescapeAt() to handle them.
819            //
820            if (c.fChar == '\\') {
821                c.fEscaped = true;
822                int[] unescapeIndex = new int[1];
823                unescapeIndex[0] = fNextIndex;
824                c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
825                if (unescapeIndex[0] == fNextIndex) {
826                    error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
827                }
828
829                fCharNum += unescapeIndex[0] - fNextIndex;
830                fNextIndex = unescapeIndex[0];
831            }
832        }
833        // putc(c.fChar, stdout);
834    }
835
836    //---------------------------------------------------------------------------------
837    //
838    //  Parse RBBI rules.   The state machine for rules parsing is here.
839    //                      The state tables are hand-written in the file rbbirpt.txt,
840    //                      and converted to the form used here by a perl
841    //                      script rbbicst.pl
842    //
843    //---------------------------------------------------------------------------------
844    void parse() {
845        int state;
846        RBBIRuleParseTable.RBBIRuleTableElement tableEl;
847
848        state = 1;
849        nextChar(fC);
850        //
851        // Main loop for the rule parsing state machine.
852        //   Runs once per state transition.
853        //   Each time through optionally performs, depending on the state table,
854        //      - an advance to the the next input char
855        //      - an action to be performed.
856        //      - pushing or popping a state to/from the local state return stack.
857        //
858        for (;;) {
859            // Quit if state == 0.  This is the normal way to exit the state machine.
860            //
861            if (state == 0) {
862                break;
863            }
864
865            // Find the state table element that matches the input char from the rule, or the
866            //    class of the input character.  Start with the first table row for this
867            //    state, then linearly scan forward until we find a row that matches the
868            //    character.  The last row for each state always matches all characters, so
869            //    the search will stop there, if not before.
870            //
871            tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
872            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
873                System.out.println("char, line, col = (\'" + (char) fC.fChar
874                        + "\', " + fLineNum + ", " + fCharNum + "    state = "
875                        + tableEl.fStateName);
876            }
877
878            for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
879                tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
880                if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
881                    System.out.print(".");
882                }
883                if (tableEl.fCharClass < 127 && fC.fEscaped == false
884                        && tableEl.fCharClass == fC.fChar) {
885                    // Table row specified an individual character, not a set, and
886                    //   the input character is not escaped, and
887                    //   the input character matched it.
888                    break;
889                }
890                if (tableEl.fCharClass == 255) {
891                    // Table row specified default, match anything character class.
892                    break;
893                }
894                if (tableEl.fCharClass == 254 && fC.fEscaped) {
895                    // Table row specified "escaped" and the char was escaped.
896                    break;
897                }
898                if (tableEl.fCharClass == 253 && fC.fEscaped
899                        && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
900                    // Table row specified "escaped P" and the char is either 'p' or 'P'.
901                    break;
902                }
903                if (tableEl.fCharClass == 252 && fC.fChar == -1) {
904                    // Table row specified eof and we hit eof on the input.
905                    break;
906                }
907
908                if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
909                        fC.fEscaped == false && //   char is not escaped &&
910                        fC.fChar != -1) { //   char is not EOF
911                    UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
912                    if (uniset.contains(fC.fChar)) {
913                        // Table row specified a character class, or set of characters,
914                        //   and the current char matches it.
915                        break;
916                    }
917                }
918            }
919
920            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
921                System.out.println("");
922            }
923            //
924            // We've found the row of the state table that matches the current input
925            //   character from the rules string.
926            // Perform any action specified  by this row in the state table.
927            if (doParseActions(tableEl.fAction) == false) {
928                // Break out of the state machine loop if the
929                //   the action signalled some kind of error, or
930                //   the action was to exit, occurs on normal end-of-rules-input.
931                break;
932            }
933
934            if (tableEl.fPushState != 0) {
935                fStackPtr++;
936                if (fStackPtr >= kStackSize) {
937                    System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
938                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
939                }
940                fStack[fStackPtr] = tableEl.fPushState;
941            }
942
943            if (tableEl.fNextChar) {
944                nextChar(fC);
945            }
946
947            // Get the next state from the table entry, or from the
948            //   state stack if the next state was specified as "pop".
949            if (tableEl.fNextState != 255) {
950                state = tableEl.fNextState;
951            } else {
952                state = fStack[fStackPtr];
953                fStackPtr--;
954                if (fStackPtr < 0) {
955                    System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
956                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
957                }
958            }
959
960        }
961
962        // If there are no forward rules throw an error.
963        //
964        if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) {
965            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
966        }
967
968        //
969        // If there were NO user specified reverse rules, set up the equivalent of ".*;"
970        //
971        if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
972            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
973            RBBINode operand = pushNewNode(RBBINode.setRef);
974            findSetFor(kAny, operand, null);
975            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
976            operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
977            fNodeStackPtr -= 2;
978        }
979
980        //
981        // Parsing of the input RBBI rules is complete.
982        // We now have a parse tree for the rule expressions
983        // and a list of all UnicodeSets that are referenced.
984        //
985        if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
986            fSymbolTable.rbbiSymtablePrint();
987        }
988        if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
989            System.out.println("Completed Forward Rules Parse Tree...");
990            fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
991            System.out.println("\nCompleted Reverse Rules Parse Tree...");
992            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
993            System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
994            if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
995                System.out.println("  -- null -- ");
996            } else {
997                fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
998            }
999            System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
1000            if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
1001                System.out.println("  -- null -- ");
1002            } else {
1003                fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
1004            }
1005        }
1006    }
1007
1008    //---------------------------------------------------------------------------------
1009    //
1010    //  printNodeStack     for debugging...
1011    //
1012    //---------------------------------------------------------------------------------
1013    ///CLOVER:OFF
1014    void printNodeStack(String title) {
1015        int i;
1016        System.out.println(title + ".  Dumping node stack...\n");
1017        for (i = fNodeStackPtr; i > 0; i--) {
1018            fNodeStack[i].printTree(true);
1019        }
1020    }
1021    ///CLOVER:ON
1022
1023    //---------------------------------------------------------------------------------
1024    //
1025    //  pushNewNode   create a new RBBINode of the specified type and push it
1026    //                onto the stack of nodes.
1027    //
1028    //---------------------------------------------------------------------------------
1029    RBBINode pushNewNode(int nodeType) {
1030        fNodeStackPtr++;
1031        if (fNodeStackPtr >= kStackSize) {
1032            System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1033            error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1034        }
1035        fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1036        return fNodeStack[fNodeStackPtr];
1037    }
1038
1039    //---------------------------------------------------------------------------------
1040    //
1041    //  scanSet    Construct a UnicodeSet from the text at the current scan
1042    //             position.  Advance the scan position to the first character
1043    //             after the set.
1044    //
1045    //             A new RBBI setref node referring to the set is pushed onto the node
1046    //             stack.
1047    //
1048    //             The scan position is normally under the control of the state machine
1049    //             that controls rule parsing.  UnicodeSets, however, are parsed by
1050    //             the UnicodeSet constructor, not by the RBBI rule parser.
1051    //
1052    //---------------------------------------------------------------------------------
1053    void scanSet() {
1054        UnicodeSet uset = null;
1055        int startPos;
1056        ParsePosition pos = new ParsePosition(fScanIndex);
1057        int i;
1058
1059        startPos = fScanIndex;
1060        try {
1061            uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1062        } catch (Exception e) { // TODO:  catch fewer exception types.
1063            // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1064            error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1065        }
1066
1067        // Verify that the set contains at least one code point.
1068        //
1069        if (uset.isEmpty()) {
1070            // This set is empty.
1071            //  Make it an error, because it almost certainly is not what the user wanted.
1072            //  Also, avoids having to think about corner cases in the tree manipulation code
1073            //   that occurs later on.
1074            //  TODO:  this shouldn't be an error; it does happen.
1075            error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1076        }
1077
1078        // Advance the RBBI parse postion over the UnicodeSet pattern.
1079        //   Don't just set fScanIndex because the line/char positions maintained
1080        //   for error reporting would be thrown off.
1081        i = pos.getIndex();
1082        for (;;) {
1083            if (fNextIndex >= i) {
1084                break;
1085            }
1086            nextCharLL();
1087        }
1088
1089        RBBINode n;
1090
1091        n = pushNewNode(RBBINode.setRef);
1092        n.fFirstPos = startPos;
1093        n.fLastPos = fNextIndex;
1094        n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1095        //  findSetFor() serves several purposes here:
1096        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1097        //     - Mantains collection of all sets in use, needed later for establishing
1098        //          character categories for run time engine.
1099        //     - Eliminates mulitiple instances of the same set.
1100        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1101        findSetFor(n.fText, n, uset);
1102    }
1103
1104}
1105
1106