1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
6 *******************************************************************************
7 */
8
9package com.ibm.icu.text;
10
11import java.text.ParsePosition;
12import java.util.HashMap;
13
14import com.ibm.icu.impl.Assert;
15import com.ibm.icu.impl.Utility;
16import com.ibm.icu.lang.UCharacter;
17
18/**
19  *  This class is part of the Rule Based Break Iterator rule compiler.
20  *  It scans the rules and builds the parse tree.
21  *  There is no public API here.
22  */
23class RBBIRuleScanner {
24
25    private final static int    kStackSize = 100;               // The size of the state stack for
26    //   rules parsing.  Corresponds roughly
27    //   to the depth of parentheses nesting
28    //   that is allowed in the rules.
29
30    static class RBBIRuleChar {
31        int             fChar;
32        boolean         fEscaped;
33    }
34
35
36
37    RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
38
39    int                       fScanIndex;        // Index of current character being processed
40                                                     //   in the rule input string.
41    int                       fNextIndex;        // Index of the next character, which
42                                                     //   is the first character not yet scanned.
43    boolean                  fQuoteMode;        // Scan is in a 'quoted region'
44    int                       fLineNum;          // Line number in input file.
45    int                       fCharNum;          // Char position within the line.
46    int                       fLastChar;         // Previous char, needed to count CR-LF
47                                                     //   as a single line, not two.
48
49    RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
50                                                     //   processing.
51
52
53    short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
54    int                       fStackPtr;           //  and pops as specified in the state
55                                                       //  transition rules.
56
57    RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
58                                                           //  during the parse of a rule
59    int                        fNodeStackPtr;
60
61
62    boolean                    fReverseRule;         // True if the rule currently being scanned
63                                                     //  is a reverse direction rule (if it
64                                                     //  starts with a '!')
65
66    boolean                    fLookAheadRule;       // True if the rule includes a '/'
67                                                     //   somewhere within it.
68
69    boolean                    fNoChainInRule;       // True if the current rule starts with a '^'.
70
71
72    RBBISymbolTable            fSymbolTable;         // symbol table, holds definitions of
73                                                     //   $variable symbols.
74
75    HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
76                                                                                       //   the sets created while parsing rules.
77                                                                                       //   The key is the string used for creating
78                                                                                       //   the set.
79
80    UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
81                                                     //  the scanning of RBBI rules.  The
82                                                     //  Indices for these are assigned by the
83                                                     //  perl script that builds the state tables.
84                                                     //  See rbbirpt.h.
85
86    int                        fRuleNum;         // Counts each rule as it is scanned.
87
88    int                        fOptionStart;     // Input index of start of a !!option
89                                                 //   keyword, while being scanned.
90
91
92   // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting.
93   static private String gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
94   static private String gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
95   static private String gRuleSet_digit_char_pattern      = "[0-9]";
96   static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
97   static private String gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
98   static private String kAny =  "any";
99
100
101
102
103    //----------------------------------------------------------------------------------------
104    //
105    //  Constructor.
106    //
107    //----------------------------------------------------------------------------------------
108    RBBIRuleScanner(RBBIRuleBuilder rb) {
109        fRB = rb;
110        fLineNum = 1;
111
112        //
113        //  Set up the constant Unicode Sets.
114        //     Note: These could be made static and shared among
115        //            all instances of RBBIRuleScanners.
116        fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
117        fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
118        fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
119        fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
120        fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
121
122        fSymbolTable = new RBBISymbolTable(this);
123    }
124
125    //----------------------------------------------------------------------------------------
126    //
127    //  doParseAction Do some action during rule parsing.
128    //                       Called by the parse state machine.
129    //                       Actions build the parse tree and Unicode Sets,
130    //                       and maintain the parse stack for nested expressions.
131    //
132    //----------------------------------------------------------------------------------------
133    boolean doParseActions(int action) {
134        RBBINode n = null;
135
136        boolean returnVal = true;
137
138        switch (action) {
139
140        case RBBIRuleParseTable.doExprStart:
141            pushNewNode(RBBINode.opStart);
142            fRuleNum++;
143            break;
144
145        case RBBIRuleParseTable.doNoChain:
146            // Scanned a '^' while on the rule start state.
147            fNoChainInRule = true;
148            break;
149
150
151        case RBBIRuleParseTable.doExprOrOperator: {
152            fixOpStack(RBBINode.precOpCat);
153            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
154            RBBINode orNode = pushNewNode(RBBINode.opOr);
155            orNode.fLeftChild = operandNode;
156            operandNode.fParent = orNode;
157        }
158            break;
159
160        case RBBIRuleParseTable.doExprCatOperator:
161        // concatenation operator.
162        // For the implicit concatenation of adjacent terms in an expression
163        // that are
164        //   not separated by any other operator. Action is invoked between the
165        //   actions for the two terms.
166        {
167            fixOpStack(RBBINode.precOpCat);
168            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
169            RBBINode catNode = pushNewNode(RBBINode.opCat);
170            catNode.fLeftChild = operandNode;
171            operandNode.fParent = catNode;
172        }
173            break;
174
175        case RBBIRuleParseTable.doLParen:
176            // Open Paren.
177            //   The openParen node is a dummy operation type with a low
178            // precedence,
179            //     which has the affect of ensuring that any real binary op that
180            //     follows within the parens binds more tightly to the operands than
181            //     stuff outside of the parens.
182            pushNewNode(RBBINode.opLParen);
183            break;
184
185        case RBBIRuleParseTable.doExprRParen:
186            fixOpStack(RBBINode.precLParen);
187            break;
188
189        case RBBIRuleParseTable.doNOP:
190            break;
191
192        case RBBIRuleParseTable.doStartAssign:
193            // We've just scanned "$variable = "
194            // The top of the node stack has the $variable ref node.
195
196            // Save the start position of the RHS text in the StartExpression
197            // node
198            //   that precedes the $variableReference node on the stack.
199            //   This will eventually be used when saving the full $variable
200            // replacement
201            //   text as a string.
202            n = fNodeStack[fNodeStackPtr - 1];
203            n.fFirstPos = fNextIndex; // move past the '='
204
205            // Push a new start-of-expression node; needed to keep parse of the
206            //   RHS expression happy.
207            pushNewNode(RBBINode.opStart);
208            break;
209
210        case RBBIRuleParseTable.doEndAssign: {
211            // We have reached the end of an assignement statement.
212            //   Current scan char is the ';' that terminates the assignment.
213
214            // Terminate expression, leaves expression parse tree rooted in TOS
215            // node.
216            fixOpStack(RBBINode.precStart);
217
218            RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
219            RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
220            RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
221
222            // Save original text of right side of assignment, excluding the
223            // terminating ';'
224            //  in the root of the node for the right-hand-side expression.
225            RHSExprNode.fFirstPos = startExprNode.fFirstPos;
226            RHSExprNode.fLastPos = fScanIndex;
227            // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
228            // RHSExprNode.fLastPos, RHSExprNode.fText);
229            RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
230                    RHSExprNode.fLastPos);
231
232            // Expression parse tree becomes l. child of the $variable reference
233            // node.
234            varRefNode.fLeftChild = RHSExprNode;
235            RHSExprNode.fParent = varRefNode;
236
237            // Make a symbol table entry for the $variableRef node.
238            fSymbolTable.addEntry(varRefNode.fText, varRefNode);
239
240            // Clean up the stack.
241            fNodeStackPtr -= 3;
242            break;
243        }
244
245        case RBBIRuleParseTable.doEndOfRule: {
246            fixOpStack(RBBINode.precStart); // Terminate expression, leaves
247                                            // expression
248
249            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
250                printNodeStack("end of rule");
251            }
252            Assert.assrt(fNodeStackPtr == 1);
253            RBBINode thisRule = fNodeStack[fNodeStackPtr];
254
255            // If this rule includes a look-ahead '/', add a endMark node to the
256            //   expression tree.
257            if (fLookAheadRule) {
258                RBBINode endNode = pushNewNode(RBBINode.endMark);
259                RBBINode catNode = pushNewNode(RBBINode.opCat);
260                fNodeStackPtr -= 2;
261                catNode.fLeftChild = thisRule;
262                catNode.fRightChild = endNode;
263                fNodeStack[fNodeStackPtr] = catNode;
264                endNode.fVal = fRuleNum;
265                endNode.fLookAheadEnd = true;
266                thisRule = catNode;
267
268                // TODO: Disable chaining out of look-ahead (hard break) rules.
269                //   The break on rule match is forced, so there is no point in building up
270                //   the state table to chain into another rule for a longer match.
271            }
272
273            // Mark this node as being the root of a rule.
274            thisRule.fRuleRoot = true;
275
276            // Flag if chaining into this rule is wanted.
277            //
278            if (fRB.fChainRules &&          // If rule chaining is enabled globally via !!chain
279                    !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
280                thisRule.fChainIn = true;
281            }
282
283
284            // All rule expressions are ORed together.
285            // The ';' that terminates an expression really just functions as a
286            // '|' with
287            //   a low operator prededence.
288            //
289            // Each of the four sets of rules are collected separately.
290            //  (forward, reverse, safe_forward, safe_reverse)
291            //  OR this rule into the appropriate group of them.
292            //
293
294            int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
295
296            if (fRB.fTreeRoots[destRules] != null) {
297                // This is not the first rule encountered.
298                // OR previous stuff (from *destRules)
299                // with the current rule expression (on the Node Stack)
300                //  with the resulting OR expression going to *destRules
301                //
302                thisRule = fNodeStack[fNodeStackPtr];
303                RBBINode prevRules = fRB.fTreeRoots[destRules];
304                RBBINode orNode = pushNewNode(RBBINode.opOr);
305                orNode.fLeftChild = prevRules;
306                prevRules.fParent = orNode;
307                orNode.fRightChild = thisRule;
308                thisRule.fParent = orNode;
309                fRB.fTreeRoots[destRules] = orNode;
310            } else {
311                // This is the first rule encountered (for this direction).
312                // Just move its parse tree from the stack to *destRules.
313                fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
314            }
315            fReverseRule = false; // in preparation for the next rule.
316            fLookAheadRule = false;
317            fNoChainInRule = false;
318            fNodeStackPtr = 0;
319        }
320            break;
321
322        case RBBIRuleParseTable.doRuleError:
323            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
324            returnVal = false;
325            break;
326
327        case RBBIRuleParseTable.doVariableNameExpectedErr:
328            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
329            break;
330
331        //
332        //  Unary operands + ? *
333        //    These all appear after the operand to which they apply.
334        //    When we hit one, the operand (may be a whole sub expression)
335        //    will be on the top of the stack.
336        //    Unary Operator becomes TOS, with the old TOS as its one child.
337        case RBBIRuleParseTable.doUnaryOpPlus: {
338            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
339            RBBINode plusNode = pushNewNode(RBBINode.opPlus);
340            plusNode.fLeftChild = operandNode;
341            operandNode.fParent = plusNode;
342        }
343            break;
344
345        case RBBIRuleParseTable.doUnaryOpQuestion: {
346            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
347            RBBINode qNode = pushNewNode(RBBINode.opQuestion);
348            qNode.fLeftChild = operandNode;
349            operandNode.fParent = qNode;
350        }
351            break;
352
353        case RBBIRuleParseTable.doUnaryOpStar: {
354            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
355            RBBINode starNode = pushNewNode(RBBINode.opStar);
356            starNode.fLeftChild = operandNode;
357            operandNode.fParent = starNode;
358        }
359            break;
360
361        case RBBIRuleParseTable.doRuleChar:
362        // A "Rule Character" is any single character that is a literal part
363        // of the regular expression. Like a, b and c in the expression "(abc*)
364        // | [:L:]"
365        // These are pretty uncommon in break rules; the terms are more commonly
366        //  sets. To keep things uniform, treat these characters like as
367        // sets that just happen to contain only one character.
368        {
369            n = pushNewNode(RBBINode.setRef);
370            String s = String.valueOf((char)fC.fChar);
371            findSetFor(s, n, null);
372            n.fFirstPos = fScanIndex;
373            n.fLastPos = fNextIndex;
374            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
375            break;
376        }
377
378        case RBBIRuleParseTable.doDotAny:
379        // scanned a ".", meaning match any single character.
380        {
381            n = pushNewNode(RBBINode.setRef);
382            findSetFor(kAny, n, null);
383            n.fFirstPos = fScanIndex;
384            n.fLastPos = fNextIndex;
385            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
386            break;
387        }
388
389        case RBBIRuleParseTable.doSlash:
390            // Scanned a '/', which identifies a look-ahead break position in a
391            // rule.
392            n = pushNewNode(RBBINode.lookAhead);
393            n.fVal = fRuleNum;
394            n.fFirstPos = fScanIndex;
395            n.fLastPos = fNextIndex;
396            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
397            fLookAheadRule = true;
398            break;
399
400        case RBBIRuleParseTable.doStartTagValue:
401            // Scanned a '{', the opening delimiter for a tag value within a
402            // rule.
403            n = pushNewNode(RBBINode.tag);
404            n.fVal = 0;
405            n.fFirstPos = fScanIndex;
406            n.fLastPos = fNextIndex;
407            break;
408
409        case RBBIRuleParseTable.doTagDigit:
410        // Just scanned a decimal digit that's part of a tag value
411        {
412            n = fNodeStack[fNodeStackPtr];
413            int v = UCharacter.digit((char) fC.fChar, 10);
414            n.fVal = n.fVal * 10 + v;
415            break;
416        }
417
418        case RBBIRuleParseTable.doTagValue:
419            n = fNodeStack[fNodeStackPtr];
420            n.fLastPos = fNextIndex;
421            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
422            break;
423
424        case RBBIRuleParseTable.doTagExpectedError:
425            error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
426            returnVal = false;
427            break;
428
429        case RBBIRuleParseTable.doOptionStart:
430            // Scanning a !!option. At the start of string.
431            fOptionStart = fScanIndex;
432            break;
433
434        case RBBIRuleParseTable.doOptionEnd: {
435            String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
436            if (opt.equals("chain")) {
437                fRB.fChainRules = true;
438            } else if (opt.equals("LBCMNoChain")) {
439                fRB.fLBCMNoChain = true;
440            } else if (opt.equals("forward")) {
441                fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
442            } else if (opt.equals("reverse")) {
443                fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
444            } else if (opt.equals("safe_forward")) {
445                fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
446            } else if (opt.equals("safe_reverse")) {
447                fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
448            } else if (opt.equals("lookAheadHardBreak")) {
449                fRB.fLookAheadHardBreak = true;
450            } else if (opt.equals("quoted_literals_only")) {
451                fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear();
452            } else if (opt.equals("unquoted_literals")) {
453                fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern);
454            } else {
455                error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
456            }
457            break;
458        }
459
460        case RBBIRuleParseTable.doReverseDir:
461            fReverseRule = true;
462            break;
463
464        case RBBIRuleParseTable.doStartVariableName:
465            n = pushNewNode(RBBINode.varRef);
466            n.fFirstPos = fScanIndex;
467            break;
468
469        case RBBIRuleParseTable.doEndVariableName:
470            n = fNodeStack[fNodeStackPtr];
471            if (n == null || n.fType != RBBINode.varRef) {
472                error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
473                break;
474            }
475            n.fLastPos = fScanIndex;
476            n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
477            // Look the newly scanned name up in the symbol table
478            //   If there's an entry, set the l. child of the var ref to the
479            // replacement expression.
480            //   (We also pass through here when scanning assignments, but no harm
481            // is done, other
482            //    than a slight wasted effort that seems hard to avoid. Lookup will
483            // be null)
484            n.fLeftChild = fSymbolTable.lookupNode(n.fText);
485            break;
486
487        case RBBIRuleParseTable.doCheckVarDef:
488            n = fNodeStack[fNodeStackPtr];
489            if (n.fLeftChild == null) {
490                error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
491                returnVal = false;
492            }
493            break;
494
495        case RBBIRuleParseTable.doExprFinished:
496            break;
497
498        case RBBIRuleParseTable.doRuleErrorAssignExpr:
499            error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
500            returnVal = false;
501            break;
502
503        case RBBIRuleParseTable.doExit:
504            returnVal = false;
505            break;
506
507        case RBBIRuleParseTable.doScanUnicodeSet:
508            scanSet();
509            break;
510
511        default:
512            error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
513            returnVal = false;
514            break;
515        }
516        return returnVal;
517    }
518
519    //----------------------------------------------------------------------------------------
520    //
521    //  Error Throw and IllegalArgumentException in response to a rule parse
522    // error.
523    //
524    //----------------------------------------------------------------------------------------
525    void error(int e) {
526        String s = "Error " + e + " at line " + fLineNum + " column "
527                + fCharNum;
528        IllegalArgumentException ex = new IllegalArgumentException(s);
529        throw ex;
530
531    }
532
533    //----------------------------------------------------------------------------------------
534    //
535    //  fixOpStack The parse stack holds partially assembled chunks of the parse
536    // tree.
537    //               An entry on the stack may be as small as a single setRef node,
538    //               or as large as the parse tree
539    //               for an entire expression (this will be the one item left on the stack
540    //               when the parsing of an RBBI rule completes.
541    //
542    //               This function is called when a binary operator is encountered.
543    //               It looks back up the stack for operators that are not yet associated
544    //               with a right operand, and if the precedence of the stacked operator >=
545    //               the precedence of the current operator, binds the operand left,
546    //               to the previously encountered operator.
547    //
548    //----------------------------------------------------------------------------------------
549    void fixOpStack(int p) {
550        RBBINode n;
551        // printNodeStack("entering fixOpStack()");
552        for (;;) {
553            n = fNodeStack[fNodeStackPtr - 1]; // an operator node
554            if (n.fPrecedence == 0) {
555                System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
556                error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
557                return;
558            }
559
560            if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
561                // The most recent operand goes with the current operator,
562                //   not with the previously stacked one.
563                break;
564            }
565            // Stack operator is a binary op ( '|' or concatenation)
566            //   TOS operand becomes right child of this operator.
567            //   Resulting subexpression becomes the TOS operand.
568            n.fRightChild = fNodeStack[fNodeStackPtr];
569            fNodeStack[fNodeStackPtr].fParent = n;
570            fNodeStackPtr--;
571            // printNodeStack("looping in fixOpStack() ");
572        }
573
574        if (p <= RBBINode.precLParen) {
575            // Scan is at a right paren or end of expression.
576            //  The scanned item must match the stack, or else there was an
577            // error.
578            //  Discard the left paren (or start expr) node from the stack,
579            //  leaving the completed (sub)expression as TOS.
580            if (n.fPrecedence != p) {
581                // Right paren encountered matched start of expression node, or
582                // end of expression matched with a left paren node.
583                error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
584            }
585            fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
586            fNodeStackPtr--;
587            // Delete the now-discarded LParen or Start node.
588            // delete n;
589        }
590        // printNodeStack("leaving fixOpStack()");
591    }
592
593    //----------------------------------------------------------------------------
594    //
595    //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
596    //                        been encountered. The val Node will be of nodetype uset
597    //                        and contain pointers to the actual UnicodeSets.
598    //                        The Key is the source string for initializing the set.
599    //
600    //                        The hash table is used to avoid creating duplicate
601    //                        unnamed (not $var references) UnicodeSets.
602    //
603    //----------------------------------------------------------------------------
604    static class RBBISetTableEl {
605        String key;
606
607        RBBINode val;
608    }
609
610
611    //----------------------------------------------------------------------------------------
612    //
613    //   findSetFor given a String,
614    //                  - find the corresponding Unicode Set (uset node)
615    //                         (create one if necessary)
616    //                  - Set fLeftChild of the caller's node (should be a setRef node)
617    //                         to the uset node
618    //                 Maintain a hash table of uset nodes, so the same one is always used
619    //                    for the same string.
620    //                 If a "to adopt" set is provided and we haven't seen this key before,
621    //                    add the provided set to the hash table.
622    //                 If the string is one (32 bit) char in length, the set contains
623    //                    just one element which is the char in question.
624    //                 If the string is "any", return a set containing all chars.
625    //
626    //----------------------------------------------------------------------------------------
627    void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
628
629        RBBISetTableEl el;
630
631        // First check whether we've already cached a set for this string.
632        // If so, just use the cached set in the new node.
633        //   delete any set provided by the caller, since we own it.
634        el = fSetTable.get(s);
635        if (el != null) {
636            node.fLeftChild = el.val;
637            Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
638            return;
639        }
640
641        // Haven't seen this set before.
642        // If the caller didn't provide us with a prebuilt set,
643        //   create a new UnicodeSet now.
644        if (setToAdopt == null) {
645            if (s.equals(kAny)) {
646                setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
647            } else {
648                int c;
649                c = UTF16.charAt(s, 0);
650                setToAdopt = new UnicodeSet(c, c);
651            }
652        }
653
654        //
655        // Make a new uset node to refer to this UnicodeSet
656        // This new uset node becomes the child of the caller's setReference
657        // node.
658        //
659        RBBINode usetNode = new RBBINode(RBBINode.uset);
660        usetNode.fInputSet = setToAdopt;
661        usetNode.fParent = node;
662        node.fLeftChild = usetNode;
663        usetNode.fText = s;
664
665        //
666        // Add the new uset node to the list of all uset nodes.
667        //
668        fRB.fUSetNodes.add(usetNode);
669
670        //
671        // Add the new set to the set hash table.
672        //
673        el = new RBBISetTableEl();
674        el.key = s;
675        el.val = usetNode;
676        fSetTable.put(el.key, el);
677
678        return;
679    }
680
681    //
682    //  Assorted Unicode character constants.
683    //     Numeric because there is no portable way to enter them as literals.
684    //     (Think EBCDIC).
685    //
686    static final int chNEL = 0x85; //    NEL newline variant
687
688    static final int chLS = 0x2028; //    Unicode Line Separator
689
690    //----------------------------------------------------------------------------------------
691    //
692    //  stripRules    Return a rules string without unnecessary
693    //                characters.
694    //
695    //----------------------------------------------------------------------------------------
696    static String stripRules(String rules) {
697        StringBuilder strippedRules = new StringBuilder();
698        int rulesLength = rules.length();
699        for (int idx = 0; idx < rulesLength;) {
700            char ch = rules.charAt(idx++);
701            if (ch == '#') {
702                while (idx < rulesLength
703                        && ch != '\r' && ch != '\n' && ch != chNEL) {
704                    ch = rules.charAt(idx++);
705                }
706            }
707            if (!UCharacter.isISOControl(ch)) {
708                strippedRules.append(ch);
709            }
710        }
711        return strippedRules.toString();
712    }
713
714    //----------------------------------------------------------------------------------------
715    //
716    //  nextCharLL    Low Level Next Char from rule input source.
717    //                Get a char from the input character iterator,
718    //                keep track of input position for error reporting.
719    //
720    //----------------------------------------------------------------------------------------
721    int nextCharLL() {
722        int ch;
723
724        if (fNextIndex >= fRB.fRules.length()) {
725            return -1;
726        }
727        ch = UTF16.charAt(fRB.fRules, fNextIndex);
728        fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
729
730        if (ch == '\r' ||
731            ch == chNEL ||
732            ch == chLS ||
733            ch == '\n' && fLastChar != '\r') {
734            // Character is starting a new line.  Bump up the line number, and
735            //  reset the column to 0.
736            fLineNum++;
737            fCharNum = 0;
738            if (fQuoteMode) {
739                error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
740                fQuoteMode = false;
741            }
742        } else {
743            // Character is not starting a new line.  Except in the case of a
744            //   LF following a CR, increment the column position.
745            if (ch != '\n') {
746                fCharNum++;
747            }
748        }
749        fLastChar = ch;
750        return ch;
751    }
752
753    //---------------------------------------------------------------------------------
754    //
755    //   nextChar     for rules scanning.  At this level, we handle stripping
756    //                out comments and processing backslash character escapes.
757    //                The rest of the rules grammar is handled at the next level up.
758    //
759    //---------------------------------------------------------------------------------
760    void nextChar(RBBIRuleChar c) {
761
762        // Unicode Character constants needed for the processing done by nextChar(),
763        //   in hex because literals wont work on EBCDIC machines.
764
765        fScanIndex = fNextIndex;
766        c.fChar = nextCharLL();
767        c.fEscaped = false;
768
769        //
770        //  check for '' sequence.
771        //  These are recognized in all contexts, whether in quoted text or not.
772        //
773        if (c.fChar == '\'') {
774            if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
775                c.fChar = nextCharLL(); // get nextChar officially so character counts
776                c.fEscaped = true; //   stay correct.
777            } else {
778                // Single quote, by itself.
779                //   Toggle quoting mode.
780                //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
781                fQuoteMode = !fQuoteMode;
782                if (fQuoteMode == true) {
783                    c.fChar = '(';
784                } else {
785                    c.fChar = ')';
786                }
787                c.fEscaped = false; // The paren that we return is not escaped.
788                return;
789            }
790        }
791
792        if (fQuoteMode) {
793            c.fEscaped = true;
794        } else {
795            // We are not in a 'quoted region' of the source.
796            //
797            if (c.fChar == '#') {
798                // Start of a comment.  Consume the rest of it.
799                //  The new-line char that terminates the comment is always returned.
800                //  It will be treated as white-space, and serves to break up anything
801                //    that might otherwise incorrectly clump together with a comment in
802                //    the middle (a variable name, for example.)
803                for (;;) {
804                    c.fChar = nextCharLL();
805                    if (c.fChar == -1 || // EOF
806                        c.fChar == '\r' ||
807                        c.fChar == '\n' ||
808                        c.fChar == chNEL ||
809                        c.fChar == chLS)
810                    {
811                        break;
812                    }
813                }
814            }
815            if (c.fChar == -1) {
816                return;
817            }
818
819            //
820            //  check for backslash escaped characters.
821            //  Use String.unescapeAt() to handle them.
822            //
823            if (c.fChar == '\\') {
824                c.fEscaped = true;
825                int[] unescapeIndex = new int[1];
826                unescapeIndex[0] = fNextIndex;
827                c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
828                if (unescapeIndex[0] == fNextIndex) {
829                    error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
830                }
831
832                fCharNum += unescapeIndex[0] - fNextIndex;
833                fNextIndex = unescapeIndex[0];
834            }
835        }
836        // putc(c.fChar, stdout);
837    }
838
839    //---------------------------------------------------------------------------------
840    //
841    //  Parse RBBI rules.   The state machine for rules parsing is here.
842    //                      The state tables are hand-written in the file rbbirpt.txt,
843    //                      and converted to the form used here by a perl
844    //                      script rbbicst.pl
845    //
846    //---------------------------------------------------------------------------------
847    void parse() {
848        int state;
849        RBBIRuleParseTable.RBBIRuleTableElement tableEl;
850
851        state = 1;
852        nextChar(fC);
853        //
854        // Main loop for the rule parsing state machine.
855        //   Runs once per state transition.
856        //   Each time through optionally performs, depending on the state table,
857        //      - an advance to the the next input char
858        //      - an action to be performed.
859        //      - pushing or popping a state to/from the local state return stack.
860        //
861        for (;;) {
862            // Quit if state == 0.  This is the normal way to exit the state machine.
863            //
864            if (state == 0) {
865                break;
866            }
867
868            // Find the state table element that matches the input char from the rule, or the
869            //    class of the input character.  Start with the first table row for this
870            //    state, then linearly scan forward until we find a row that matches the
871            //    character.  The last row for each state always matches all characters, so
872            //    the search will stop there, if not before.
873            //
874            tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
875            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
876                System.out.println("char, line, col = (\'" + (char) fC.fChar
877                        + "\', " + fLineNum + ", " + fCharNum + "    state = "
878                        + tableEl.fStateName);
879            }
880
881            for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
882                tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
883                if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
884                    System.out.print(".");
885                }
886                if (tableEl.fCharClass < 127 && fC.fEscaped == false
887                        && tableEl.fCharClass == fC.fChar) {
888                    // Table row specified an individual character, not a set, and
889                    //   the input character is not escaped, and
890                    //   the input character matched it.
891                    break;
892                }
893                if (tableEl.fCharClass == 255) {
894                    // Table row specified default, match anything character class.
895                    break;
896                }
897                if (tableEl.fCharClass == 254 && fC.fEscaped) {
898                    // Table row specified "escaped" and the char was escaped.
899                    break;
900                }
901                if (tableEl.fCharClass == 253 && fC.fEscaped
902                        && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
903                    // Table row specified "escaped P" and the char is either 'p' or 'P'.
904                    break;
905                }
906                if (tableEl.fCharClass == 252 && fC.fChar == -1) {
907                    // Table row specified eof and we hit eof on the input.
908                    break;
909                }
910
911                if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
912                        fC.fEscaped == false && //   char is not escaped &&
913                        fC.fChar != -1) { //   char is not EOF
914                    UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
915                    if (uniset.contains(fC.fChar)) {
916                        // Table row specified a character class, or set of characters,
917                        //   and the current char matches it.
918                        break;
919                    }
920                }
921            }
922
923            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
924                System.out.println("");
925            }
926            //
927            // We've found the row of the state table that matches the current input
928            //   character from the rules string.
929            // Perform any action specified  by this row in the state table.
930            if (doParseActions(tableEl.fAction) == false) {
931                // Break out of the state machine loop if the
932                //   the action signalled some kind of error, or
933                //   the action was to exit, occurs on normal end-of-rules-input.
934                break;
935            }
936
937            if (tableEl.fPushState != 0) {
938                fStackPtr++;
939                if (fStackPtr >= kStackSize) {
940                    System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
941                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
942                }
943                fStack[fStackPtr] = tableEl.fPushState;
944            }
945
946            if (tableEl.fNextChar) {
947                nextChar(fC);
948            }
949
950            // Get the next state from the table entry, or from the
951            //   state stack if the next state was specified as "pop".
952            if (tableEl.fNextState != 255) {
953                state = tableEl.fNextState;
954            } else {
955                state = fStack[fStackPtr];
956                fStackPtr--;
957                if (fStackPtr < 0) {
958                    System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
959                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
960                }
961            }
962
963        }
964
965        // If there are no forward rules throw an error.
966        //
967        if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) {
968            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
969        }
970
971        //
972        // If there were NO user specified reverse rules, set up the equivalent of ".*;"
973        //
974        if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
975            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
976            RBBINode operand = pushNewNode(RBBINode.setRef);
977            findSetFor(kAny, operand, null);
978            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
979            operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
980            fNodeStackPtr -= 2;
981        }
982
983        //
984        // Parsing of the input RBBI rules is complete.
985        // We now have a parse tree for the rule expressions
986        // and a list of all UnicodeSets that are referenced.
987        //
988        if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
989            fSymbolTable.rbbiSymtablePrint();
990        }
991        if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
992            System.out.println("Completed Forward Rules Parse Tree...");
993            fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
994            System.out.println("\nCompleted Reverse Rules Parse Tree...");
995            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
996            System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
997            if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
998                System.out.println("  -- null -- ");
999            } else {
1000                fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
1001            }
1002            System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
1003            if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
1004                System.out.println("  -- null -- ");
1005            } else {
1006                fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
1007            }
1008        }
1009    }
1010
1011    //---------------------------------------------------------------------------------
1012    //
1013    //  printNodeStack     for debugging...
1014    //
1015    //---------------------------------------------------------------------------------
1016    ///CLOVER:OFF
1017    void printNodeStack(String title) {
1018        int i;
1019        System.out.println(title + ".  Dumping node stack...\n");
1020        for (i = fNodeStackPtr; i > 0; i--) {
1021            fNodeStack[i].printTree(true);
1022        }
1023    }
1024    ///CLOVER:ON
1025
1026    //---------------------------------------------------------------------------------
1027    //
1028    //  pushNewNode   create a new RBBINode of the specified type and push it
1029    //                onto the stack of nodes.
1030    //
1031    //---------------------------------------------------------------------------------
1032    RBBINode pushNewNode(int nodeType) {
1033        fNodeStackPtr++;
1034        if (fNodeStackPtr >= kStackSize) {
1035            System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1036            error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1037        }
1038        fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1039        return fNodeStack[fNodeStackPtr];
1040    }
1041
1042    //---------------------------------------------------------------------------------
1043    //
1044    //  scanSet    Construct a UnicodeSet from the text at the current scan
1045    //             position.  Advance the scan position to the first character
1046    //             after the set.
1047    //
1048    //             A new RBBI setref node referring to the set is pushed onto the node
1049    //             stack.
1050    //
1051    //             The scan position is normally under the control of the state machine
1052    //             that controls rule parsing.  UnicodeSets, however, are parsed by
1053    //             the UnicodeSet constructor, not by the RBBI rule parser.
1054    //
1055    //---------------------------------------------------------------------------------
1056    void scanSet() {
1057        UnicodeSet uset = null;
1058        int startPos;
1059        ParsePosition pos = new ParsePosition(fScanIndex);
1060        int i;
1061
1062        startPos = fScanIndex;
1063        try {
1064            uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1065        } catch (Exception e) { // TODO:  catch fewer exception types.
1066            // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1067            error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1068        }
1069
1070        // Verify that the set contains at least one code point.
1071        //
1072        if (uset.isEmpty()) {
1073            // This set is empty.
1074            //  Make it an error, because it almost certainly is not what the user wanted.
1075            //  Also, avoids having to think about corner cases in the tree manipulation code
1076            //   that occurs later on.
1077            //  TODO:  this shouldn't be an error; it does happen.
1078            error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1079        }
1080
1081        // Advance the RBBI parse postion over the UnicodeSet pattern.
1082        //   Don't just set fScanIndex because the line/char positions maintained
1083        //   for error reporting would be thrown off.
1084        i = pos.getIndex();
1085        for (;;) {
1086            if (fNextIndex >= i) {
1087                break;
1088            }
1089            nextCharLL();
1090        }
1091
1092        RBBINode n;
1093
1094        n = pushNewNode(RBBINode.setRef);
1095        n.fFirstPos = startPos;
1096        n.fLastPos = fNextIndex;
1097        n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1098        //  findSetFor() serves several purposes here:
1099        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1100        //     - Mantains collection of all sets in use, needed later for establishing
1101        //          character categories for run time engine.
1102        //     - Eliminates mulitiple instances of the same set.
1103        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1104        findSetFor(n.fText, n, uset);
1105    }
1106
1107}
1108
1109