1/*
2 *******************************************************************************
3 * Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved.
4 *******************************************************************************
5 */
6
7package com.ibm.icu.text;
8
9import java.text.ParsePosition;
10import java.util.HashMap;
11
12import com.ibm.icu.impl.Assert;
13import com.ibm.icu.impl.Utility;
14import com.ibm.icu.lang.UCharacter;
15
16/**
17  *  This class is part of the Rule Based Break Iterator rule compiler.
18  *  It scans the rules and builds the parse tree.
19  *  There is no public API here.
20  */
21class RBBIRuleScanner {
22
23    private final static int    kStackSize = 100;               // The size of the state stack for
24    //   rules parsing.  Corresponds roughly
25    //   to the depth of parentheses nesting
26    //   that is allowed in the rules.
27
28    static class RBBIRuleChar {
29        int             fChar;
30        boolean         fEscaped;
31    }
32
33
34
35    RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
36
37    int                       fScanIndex;        // Index of current character being processed
38                                                     //   in the rule input string.
39    int                       fNextIndex;        // Index of the next character, which
40                                                     //   is the first character not yet scanned.
41    boolean                  fQuoteMode;        // Scan is in a 'quoted region'
42    int                       fLineNum;          // Line number in input file.
43    int                       fCharNum;          // Char position within the line.
44    int                       fLastChar;         // Previous char, needed to count CR-LF
45                                                     //   as a single line, not two.
46
47    RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
48                                                     //   processing.
49    String                    fVarName;          // $variableName, valid when we've just
50                                                     //   scanned one.
51
52
53    short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
54    int                       fStackPtr;           //  and pops as specified in the state
55                                                       //  transition rules.
56
57    RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
58                                                           //  during the parse of a rule
59    int                        fNodeStackPtr;
60
61
62    boolean                          fReverseRule;     // True if the rule currently being scanned
63                                                     //  is a reverse direction rule (if it
64                                                     //  starts with a '!')
65
66    boolean                          fLookAheadRule;   // True if the rule includes a '/'
67                                                     //   somewhere within it.
68
69    RBBISymbolTable              fSymbolTable;     // symbol table, holds definitions of
70                                                     //   $variable symbols.
71
72    HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
73                                                                                       //   the sets created while parsing rules.
74                                                                                       //   The key is the string used for creating
75                                                                                       //   the set.
76
77    UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
78                                                     //  the scanning of RBBI rules.  The
79                                                     //  indicies for these are assigned by the
80                                                     //  perl script that builds the state tables.
81                                                     //  See rbbirpt.h.
82
83    int                        fRuleNum;         // Counts each rule as it is scanned.
84
85    int                        fOptionStart;     // Input index of start of a !!option
86                                                 //   keyword, while being scanned.
87
88
89
90   static private String gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
91   static private String gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
92   static private String gRuleSet_digit_char_pattern      = "[0-9]";
93   static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
94   static private String gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
95   static private String kAny =  "any";
96
97
98
99
100    //----------------------------------------------------------------------------------------
101    //
102    //  Constructor.
103    //
104    //----------------------------------------------------------------------------------------
105    RBBIRuleScanner(RBBIRuleBuilder rb) {
106        fRB = rb;
107        fLineNum = 1;
108
109        //
110        //  Set up the constant Unicode Sets.
111        //     Note: These could be made static and shared among
112        //            all instances of RBBIRuleScanners.
113        fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
114        fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
115        fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
116        fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
117        fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
118
119        fSymbolTable = new RBBISymbolTable(this, rb.fRules);
120    }
121
122    //----------------------------------------------------------------------------------------
123    //
124    //  doParseAction Do some action during rule parsing.
125    //                       Called by the parse state machine.
126    //                       Actions build the parse tree and Unicode Sets,
127    //                       and maintain the parse stack for nested expressions.
128    //
129    //----------------------------------------------------------------------------------------
130    boolean doParseActions(int action) {
131        RBBINode n = null;
132
133        boolean returnVal = true;
134
135        switch (action) {
136
137        case RBBIRuleParseTable.doExprStart:
138            pushNewNode(RBBINode.opStart);
139            fRuleNum++;
140            break;
141
142        case RBBIRuleParseTable.doExprOrOperator: {
143            fixOpStack(RBBINode.precOpCat);
144            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
145            RBBINode orNode = pushNewNode(RBBINode.opOr);
146            orNode.fLeftChild = operandNode;
147            operandNode.fParent = orNode;
148        }
149            break;
150
151        case RBBIRuleParseTable.doExprCatOperator:
152        // concatenation operator.
153        // For the implicit concatenation of adjacent terms in an expression
154        // that are
155        //   not separated by any other operator. Action is invoked between the
156        //   actions for the two terms.
157        {
158            fixOpStack(RBBINode.precOpCat);
159            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
160            RBBINode catNode = pushNewNode(RBBINode.opCat);
161            catNode.fLeftChild = operandNode;
162            operandNode.fParent = catNode;
163        }
164            break;
165
166        case RBBIRuleParseTable.doLParen:
167            // Open Paren.
168            //   The openParen node is a dummy operation type with a low
169            // precedence,
170            //     which has the affect of ensuring that any real binary op that
171            //     follows within the parens binds more tightly to the operands than
172            //     stuff outside of the parens.
173            pushNewNode(RBBINode.opLParen);
174            break;
175
176        case RBBIRuleParseTable.doExprRParen:
177            fixOpStack(RBBINode.precLParen);
178            break;
179
180        case RBBIRuleParseTable.doNOP:
181            break;
182
183        case RBBIRuleParseTable.doStartAssign:
184            // We've just scanned "$variable = "
185            // The top of the node stack has the $variable ref node.
186
187            // Save the start position of the RHS text in the StartExpression
188            // node
189            //   that precedes the $variableReference node on the stack.
190            //   This will eventually be used when saving the full $variable
191            // replacement
192            //   text as a string.
193            n = fNodeStack[fNodeStackPtr - 1];
194            n.fFirstPos = fNextIndex; // move past the '='
195
196            // Push a new start-of-expression node; needed to keep parse of the
197            //   RHS expression happy.
198            pushNewNode(RBBINode.opStart);
199            break;
200
201        case RBBIRuleParseTable.doEndAssign: {
202            // We have reached the end of an assignement statement.
203            //   Current scan char is the ';' that terminates the assignment.
204
205            // Terminate expression, leaves expression parse tree rooted in TOS
206            // node.
207            fixOpStack(RBBINode.precStart);
208
209            RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
210            RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
211            RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
212
213            // Save original text of right side of assignment, excluding the
214            // terminating ';'
215            //  in the root of the node for the right-hand-side expression.
216            RHSExprNode.fFirstPos = startExprNode.fFirstPos;
217            RHSExprNode.fLastPos = fScanIndex;
218            // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
219            // RHSExprNode.fLastPos, RHSExprNode.fText);
220            RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
221                    RHSExprNode.fLastPos);
222
223            // Expression parse tree becomes l. child of the $variable reference
224            // node.
225            varRefNode.fLeftChild = RHSExprNode;
226            RHSExprNode.fParent = varRefNode;
227
228            // Make a symbol table entry for the $variableRef node.
229            fSymbolTable.addEntry(varRefNode.fText, varRefNode);
230
231            // Clean up the stack.
232            fNodeStackPtr -= 3;
233            break;
234        }
235
236        case RBBIRuleParseTable.doEndOfRule: {
237            fixOpStack(RBBINode.precStart); // Terminate expression, leaves
238                                            // expression
239
240            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
241                printNodeStack("end of rule");
242            }
243            Assert.assrt(fNodeStackPtr == 1);
244
245            // If this rule includes a look-ahead '/', add a endMark node to the
246            //   expression tree.
247            if (fLookAheadRule) {
248                RBBINode thisRule = fNodeStack[fNodeStackPtr];
249                RBBINode endNode = pushNewNode(RBBINode.endMark);
250                RBBINode catNode = pushNewNode(RBBINode.opCat);
251                fNodeStackPtr -= 2;
252                catNode.fLeftChild = thisRule;
253                catNode.fRightChild = endNode;
254                fNodeStack[fNodeStackPtr] = catNode;
255                endNode.fVal = fRuleNum;
256                endNode.fLookAheadEnd = true;
257            }
258
259            // All rule expressions are ORed together.
260            // The ';' that terminates an expression really just functions as a
261            // '|' with
262            //   a low operator prededence.
263            //
264            // Each of the four sets of rules are collected separately.
265            //  (forward, reverse, safe_forward, safe_reverse)
266            //  OR this rule into the appropriate group of them.
267            //
268
269            int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
270
271            if (fRB.fTreeRoots[destRules] != null) {
272                // This is not the first rule encounted.
273                // OR previous stuff (from *destRules)
274                // with the current rule expression (on the Node Stack)
275                //  with the resulting OR expression going to *destRules
276                //
277                RBBINode thisRule = fNodeStack[fNodeStackPtr];
278                RBBINode prevRules = fRB.fTreeRoots[destRules];
279                RBBINode orNode = pushNewNode(RBBINode.opOr);
280                orNode.fLeftChild = prevRules;
281                prevRules.fParent = orNode;
282                orNode.fRightChild = thisRule;
283                thisRule.fParent = orNode;
284                fRB.fTreeRoots[destRules] = orNode;
285            } else {
286                // This is the first rule encountered (for this direction).
287                // Just move its parse tree from the stack to *destRules.
288                fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
289            }
290            fReverseRule = false; // in preparation for the next rule.
291            fLookAheadRule = false;
292            fNodeStackPtr = 0;
293        }
294            break;
295
296        case RBBIRuleParseTable.doRuleError:
297            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
298            returnVal = false;
299            break;
300
301        case RBBIRuleParseTable.doVariableNameExpectedErr:
302            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
303            break;
304
305        //
306        //  Unary operands + ? *
307        //    These all appear after the operand to which they apply.
308        //    When we hit one, the operand (may be a whole sub expression)
309        //    will be on the top of the stack.
310        //    Unary Operator becomes TOS, with the old TOS as its one child.
311        case RBBIRuleParseTable.doUnaryOpPlus: {
312            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
313            RBBINode plusNode = pushNewNode(RBBINode.opPlus);
314            plusNode.fLeftChild = operandNode;
315            operandNode.fParent = plusNode;
316        }
317            break;
318
319        case RBBIRuleParseTable.doUnaryOpQuestion: {
320            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
321            RBBINode qNode = pushNewNode(RBBINode.opQuestion);
322            qNode.fLeftChild = operandNode;
323            operandNode.fParent = qNode;
324        }
325            break;
326
327        case RBBIRuleParseTable.doUnaryOpStar: {
328            RBBINode operandNode = fNodeStack[fNodeStackPtr--];
329            RBBINode starNode = pushNewNode(RBBINode.opStar);
330            starNode.fLeftChild = operandNode;
331            operandNode.fParent = starNode;
332        }
333            break;
334
335        case RBBIRuleParseTable.doRuleChar:
336        // A "Rule Character" is any single character that is a literal part
337        // of the regular expression. Like a, b and c in the expression "(abc*)
338        // | [:L:]"
339        // These are pretty uncommon in break rules; the terms are more commonly
340        //  sets. To keep things uniform, treat these characters like as
341        // sets that just happen to contain only one character.
342        {
343            n = pushNewNode(RBBINode.setRef);
344            String s = String.valueOf((char)fC.fChar);
345            findSetFor(s, n, null);
346            n.fFirstPos = fScanIndex;
347            n.fLastPos = fNextIndex;
348            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
349            break;
350        }
351
352        case RBBIRuleParseTable.doDotAny:
353        // scanned a ".", meaning match any single character.
354        {
355            n = pushNewNode(RBBINode.setRef);
356            findSetFor(kAny, n, null);
357            n.fFirstPos = fScanIndex;
358            n.fLastPos = fNextIndex;
359            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
360            break;
361        }
362
363        case RBBIRuleParseTable.doSlash:
364            // Scanned a '/', which identifies a look-ahead break position in a
365            // rule.
366            n = pushNewNode(RBBINode.lookAhead);
367            n.fVal = fRuleNum;
368            n.fFirstPos = fScanIndex;
369            n.fLastPos = fNextIndex;
370            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
371            fLookAheadRule = true;
372            break;
373
374        case RBBIRuleParseTable.doStartTagValue:
375            // Scanned a '{', the opening delimiter for a tag value within a
376            // rule.
377            n = pushNewNode(RBBINode.tag);
378            n.fVal = 0;
379            n.fFirstPos = fScanIndex;
380            n.fLastPos = fNextIndex;
381            break;
382
383        case RBBIRuleParseTable.doTagDigit:
384        // Just scanned a decimal digit that's part of a tag value
385        {
386            n = fNodeStack[fNodeStackPtr];
387            int v = UCharacter.digit((char) fC.fChar, 10);
388            n.fVal = n.fVal * 10 + v;
389            break;
390        }
391
392        case RBBIRuleParseTable.doTagValue:
393            n = fNodeStack[fNodeStackPtr];
394            n.fLastPos = fNextIndex;
395            n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
396            break;
397
398        case RBBIRuleParseTable.doTagExpectedError:
399            error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
400            returnVal = false;
401            break;
402
403        case RBBIRuleParseTable.doOptionStart:
404            // Scanning a !!option. At the start of string.
405            fOptionStart = fScanIndex;
406            break;
407
408        case RBBIRuleParseTable.doOptionEnd: {
409            String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
410            if (opt.equals("chain")) {
411                fRB.fChainRules = true;
412            } else if (opt.equals("LBCMNoChain")) {
413                fRB.fLBCMNoChain = true;
414            } else if (opt.equals("forward")) {
415                fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
416            } else if (opt.equals("reverse")) {
417                fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
418            } else if (opt.equals("safe_forward")) {
419                fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
420            } else if (opt.equals("safe_reverse")) {
421                fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
422            } else if (opt.equals("lookAheadHardBreak")) {
423                fRB.fLookAheadHardBreak = true;
424            } else {
425                error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
426            }
427            break;
428        }
429
430        case RBBIRuleParseTable.doReverseDir:
431            fReverseRule = true;
432            break;
433
434        case RBBIRuleParseTable.doStartVariableName:
435            n = pushNewNode(RBBINode.varRef);
436            n.fFirstPos = fScanIndex;
437            break;
438
439        case RBBIRuleParseTable.doEndVariableName:
440            n = fNodeStack[fNodeStackPtr];
441            if (n == null || n.fType != RBBINode.varRef) {
442                error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
443                break;
444            }
445            n.fLastPos = fScanIndex;
446            n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
447            // Look the newly scanned name up in the symbol table
448            //   If there's an entry, set the l. child of the var ref to the
449            // replacement expression.
450            //   (We also pass through here when scanning assignments, but no harm
451            // is done, other
452            //    than a slight wasted effort that seems hard to avoid. Lookup will
453            // be null)
454            n.fLeftChild = fSymbolTable.lookupNode(n.fText);
455            break;
456
457        case RBBIRuleParseTable.doCheckVarDef:
458            n = fNodeStack[fNodeStackPtr];
459            if (n.fLeftChild == null) {
460                error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
461                returnVal = false;
462            }
463            break;
464
465        case RBBIRuleParseTable.doExprFinished:
466            break;
467
468        case RBBIRuleParseTable.doRuleErrorAssignExpr:
469            error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
470            returnVal = false;
471            break;
472
473        case RBBIRuleParseTable.doExit:
474            returnVal = false;
475            break;
476
477        case RBBIRuleParseTable.doScanUnicodeSet:
478            scanSet();
479            break;
480
481        default:
482            error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
483            returnVal = false;
484            break;
485        }
486        return returnVal;
487    }
488
489    //----------------------------------------------------------------------------------------
490    //
491    //  Error Throw and IllegalArgumentException in response to a rule parse
492    // error.
493    //
494    //----------------------------------------------------------------------------------------
495    void error(int e) {
496        String s = "Error " + e + " at line " + fLineNum + " column "
497                + fCharNum;
498        IllegalArgumentException ex = new IllegalArgumentException(s);
499        throw ex;
500
501    }
502
503    //----------------------------------------------------------------------------------------
504    //
505    //  fixOpStack The parse stack holds partially assembled chunks of the parse
506    // tree.
507    //               An entry on the stack may be as small as a single setRef node,
508    //               or as large as the parse tree
509    //               for an entire expression (this will be the one item left on the stack
510    //               when the parsing of an RBBI rule completes.
511    //
512    //               This function is called when a binary operator is encountered.
513    //               It looks back up the stack for operators that are not yet associated
514    //               with a right operand, and if the precedence of the stacked operator >=
515    //               the precedence of the current operator, binds the operand left,
516    //               to the previously encountered operator.
517    //
518    //----------------------------------------------------------------------------------------
519    void fixOpStack(int p) {
520        RBBINode n;
521        // printNodeStack("entering fixOpStack()");
522        for (;;) {
523            n = fNodeStack[fNodeStackPtr - 1]; // an operator node
524            if (n.fPrecedence == 0) {
525                System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
526                error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
527                return;
528            }
529
530            if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
531                // The most recent operand goes with the current operator,
532                //   not with the previously stacked one.
533                break;
534            }
535            // Stack operator is a binary op ( '|' or concatenation)
536            //   TOS operand becomes right child of this operator.
537            //   Resulting subexpression becomes the TOS operand.
538            n.fRightChild = fNodeStack[fNodeStackPtr];
539            fNodeStack[fNodeStackPtr].fParent = n;
540            fNodeStackPtr--;
541            // printNodeStack("looping in fixOpStack() ");
542        }
543
544        if (p <= RBBINode.precLParen) {
545            // Scan is at a right paren or end of expression.
546            //  The scanned item must match the stack, or else there was an
547            // error.
548            //  Discard the left paren (or start expr) node from the stack,
549            //  leaving the completed (sub)expression as TOS.
550            if (n.fPrecedence != p) {
551                // Right paren encountered matched start of expression node, or
552                // end of expression matched with a left paren node.
553                error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
554            }
555            fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
556            fNodeStackPtr--;
557            // Delete the now-discarded LParen or Start node.
558            // delete n;
559        }
560        // printNodeStack("leaving fixOpStack()");
561    }
562
563    //----------------------------------------------------------------------------
564    //
565    //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
566    //                        been encountered. The val Node will be of nodetype uset
567    //                        and contain pointers to the actual UnicodeSets.
568    //                        The Key is the source string for initializing the set.
569    //
570    //                        The hash table is used to avoid creating duplicate
571    //                        unnamed (not $var references) UnicodeSets.
572    //
573    //----------------------------------------------------------------------------
574    static class RBBISetTableEl {
575        String key;
576
577        RBBINode val;
578    }
579
580
581    //----------------------------------------------------------------------------------------
582    //
583    //   findSetFor given a String,
584    //                  - find the corresponding Unicode Set (uset node)
585    //                         (create one if necessary)
586    //                  - Set fLeftChild of the caller's node (should be a setRef node)
587    //                         to the uset node
588    //                 Maintain a hash table of uset nodes, so the same one is always used
589    //                    for the same string.
590    //                 If a "to adopt" set is provided and we haven't seen this key before,
591    //                    add the provided set to the hash table.
592    //                 If the string is one (32 bit) char in length, the set contains
593    //                    just one element which is the char in question.
594    //                 If the string is "any", return a set containing all chars.
595    //
596    //----------------------------------------------------------------------------------------
597    void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
598
599        RBBISetTableEl el;
600
601        // First check whether we've already cached a set for this string.
602        // If so, just use the cached set in the new node.
603        //   delete any set provided by the caller, since we own it.
604        el = fSetTable.get(s);
605        if (el != null) {
606            node.fLeftChild = el.val;
607            Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
608            return;
609        }
610
611        // Haven't seen this set before.
612        // If the caller didn't provide us with a prebuilt set,
613        //   create a new UnicodeSet now.
614        if (setToAdopt == null) {
615            if (s.equals(kAny)) {
616                setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
617            } else {
618                int c;
619                c = UTF16.charAt(s, 0);
620                setToAdopt = new UnicodeSet(c, c);
621            }
622        }
623
624        //
625        // Make a new uset node to refer to this UnicodeSet
626        // This new uset node becomes the child of the caller's setReference
627        // node.
628        //
629        RBBINode usetNode = new RBBINode(RBBINode.uset);
630        usetNode.fInputSet = setToAdopt;
631        usetNode.fParent = node;
632        node.fLeftChild = usetNode;
633        usetNode.fText = s;
634
635        //
636        // Add the new uset node to the list of all uset nodes.
637        //
638        fRB.fUSetNodes.add(usetNode);
639
640        //
641        // Add the new set to the set hash table.
642        //
643        el = new RBBISetTableEl();
644        el.key = s;
645        el.val = usetNode;
646        fSetTable.put(el.key, el);
647
648        return;
649    }
650
651    //
652    //  Assorted Unicode character constants.
653    //     Numeric because there is no portable way to enter them as literals.
654    //     (Think EBCDIC).
655    //
656    static final int chNEL = 0x85; //    NEL newline variant
657
658    static final int chLS = 0x2028; //    Unicode Line Separator
659
660    //----------------------------------------------------------------------------------------
661    //
662    //  stripRules    Return a rules string without unnecessary
663    //                characters.
664    //
665    //----------------------------------------------------------------------------------------
666    static String stripRules(String rules) {
667        StringBuilder strippedRules = new StringBuilder();
668        int rulesLength = rules.length();
669        for (int idx = 0; idx < rulesLength;) {
670            char ch = rules.charAt(idx++);
671            if (ch == '#') {
672                while (idx < rulesLength
673                        && ch != '\r' && ch != '\n' && ch != chNEL) {
674                    ch = rules.charAt(idx++);
675                }
676            }
677            if (!UCharacter.isISOControl(ch)) {
678                strippedRules.append(ch);
679            }
680        }
681        return strippedRules.toString();
682    }
683
684    //----------------------------------------------------------------------------------------
685    //
686    //  nextCharLL    Low Level Next Char from rule input source.
687    //                Get a char from the input character iterator,
688    //                keep track of input position for error reporting.
689    //
690    //----------------------------------------------------------------------------------------
691    int nextCharLL() {
692        int ch;
693
694        if (fNextIndex >= fRB.fRules.length()) {
695            return -1;
696        }
697        ch = UTF16.charAt(fRB.fRules, fNextIndex);
698        fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
699
700        if (ch == '\r' ||
701            ch == chNEL ||
702            ch == chLS ||
703            ch == '\n' && fLastChar != '\r') {
704            // Character is starting a new line.  Bump up the line number, and
705            //  reset the column to 0.
706            fLineNum++;
707            fCharNum = 0;
708            if (fQuoteMode) {
709                error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
710                fQuoteMode = false;
711            }
712        } else {
713            // Character is not starting a new line.  Except in the case of a
714            //   LF following a CR, increment the column position.
715            if (ch != '\n') {
716                fCharNum++;
717            }
718        }
719        fLastChar = ch;
720        return ch;
721    }
722
723    //---------------------------------------------------------------------------------
724    //
725    //   nextChar     for rules scanning.  At this level, we handle stripping
726    //                out comments and processing backslash character escapes.
727    //                The rest of the rules grammar is handled at the next level up.
728    //
729    //---------------------------------------------------------------------------------
730    void nextChar(RBBIRuleChar c) {
731
732        // Unicode Character constants needed for the processing done by nextChar(),
733        //   in hex because literals wont work on EBCDIC machines.
734
735        fScanIndex = fNextIndex;
736        c.fChar = nextCharLL();
737        c.fEscaped = false;
738
739        //
740        //  check for '' sequence.
741        //  These are recognized in all contexts, whether in quoted text or not.
742        //
743        if (c.fChar == '\'') {
744            if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
745                c.fChar = nextCharLL(); // get nextChar officially so character counts
746                c.fEscaped = true; //   stay correct.
747            } else {
748                // Single quote, by itself.
749                //   Toggle quoting mode.
750                //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
751                fQuoteMode = !fQuoteMode;
752                if (fQuoteMode == true) {
753                    c.fChar = '(';
754                } else {
755                    c.fChar = ')';
756                }
757                c.fEscaped = false; // The paren that we return is not escaped.
758                return;
759            }
760        }
761
762        if (fQuoteMode) {
763            c.fEscaped = true;
764        } else {
765            // We are not in a 'quoted region' of the source.
766            //
767            if (c.fChar == '#') {
768                // Start of a comment.  Consume the rest of it.
769                //  The new-line char that terminates the comment is always returned.
770                //  It will be treated as white-space, and serves to break up anything
771                //    that might otherwise incorrectly clump together with a comment in
772                //    the middle (a variable name, for example.)
773                for (;;) {
774                    c.fChar = nextCharLL();
775                    if (c.fChar == -1 || // EOF
776                        c.fChar == '\r' ||
777                        c.fChar == '\n' ||
778                        c.fChar == chNEL ||
779                        c.fChar == chLS)
780                    {
781                        break;
782                    }
783                }
784            }
785            if (c.fChar == -1) {
786                return;
787            }
788
789            //
790            //  check for backslash escaped characters.
791            //  Use String.unescapeAt() to handle them.
792            //
793            if (c.fChar == '\\') {
794                c.fEscaped = true;
795                int[] unescapeIndex = new int[1];
796                unescapeIndex[0] = fNextIndex;
797                c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
798                if (unescapeIndex[0] == fNextIndex) {
799                    error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
800                }
801
802                fCharNum += unescapeIndex[0] - fNextIndex;
803                fNextIndex = unescapeIndex[0];
804            }
805        }
806        // putc(c.fChar, stdout);
807    }
808
809    //---------------------------------------------------------------------------------
810    //
811    //  Parse RBBI rules.   The state machine for rules parsing is here.
812    //                      The state tables are hand-written in the file rbbirpt.txt,
813    //                      and converted to the form used here by a perl
814    //                      script rbbicst.pl
815    //
816    //---------------------------------------------------------------------------------
817    void parse() {
818        int state;
819        RBBIRuleParseTable.RBBIRuleTableElement tableEl;
820
821        state = 1;
822        nextChar(fC);
823        //
824        // Main loop for the rule parsing state machine.
825        //   Runs once per state transition.
826        //   Each time through optionally performs, depending on the state table,
827        //      - an advance to the the next input char
828        //      - an action to be performed.
829        //      - pushing or popping a state to/from the local state return stack.
830        //
831        for (;;) {
832            // Quit if state == 0.  This is the normal way to exit the state machine.
833            //
834            if (state == 0) {
835                break;
836            }
837
838            // Find the state table element that matches the input char from the rule, or the
839            //    class of the input character.  Start with the first table row for this
840            //    state, then linearly scan forward until we find a row that matches the
841            //    character.  The last row for each state always matches all characters, so
842            //    the search will stop there, if not before.
843            //
844            tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
845            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
846                System.out.println("char, line, col = (\'" + (char) fC.fChar
847                        + "\', " + fLineNum + ", " + fCharNum + "    state = "
848                        + tableEl.fStateName);
849            }
850
851            for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
852                tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
853                if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
854                    System.out.print(".");
855                }
856                if (tableEl.fCharClass < 127 && fC.fEscaped == false
857                        && tableEl.fCharClass == fC.fChar) {
858                    // Table row specified an individual character, not a set, and
859                    //   the input character is not escaped, and
860                    //   the input character matched it.
861                    break;
862                }
863                if (tableEl.fCharClass == 255) {
864                    // Table row specified default, match anything character class.
865                    break;
866                }
867                if (tableEl.fCharClass == 254 && fC.fEscaped) {
868                    // Table row specified "escaped" and the char was escaped.
869                    break;
870                }
871                if (tableEl.fCharClass == 253 && fC.fEscaped
872                        && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
873                    // Table row specified "escaped P" and the char is either 'p' or 'P'.
874                    break;
875                }
876                if (tableEl.fCharClass == 252 && fC.fChar == -1) {
877                    // Table row specified eof and we hit eof on the input.
878                    break;
879                }
880
881                if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
882                        fC.fEscaped == false && //   char is not escaped &&
883                        fC.fChar != -1) { //   char is not EOF
884                    UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
885                    if (uniset.contains(fC.fChar)) {
886                        // Table row specified a character class, or set of characters,
887                        //   and the current char matches it.
888                        break;
889                    }
890                }
891            }
892
893            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
894                System.out.println("");
895            }
896            //
897            // We've found the row of the state table that matches the current input
898            //   character from the rules string.
899            // Perform any action specified  by this row in the state table.
900            if (doParseActions(tableEl.fAction) == false) {
901                // Break out of the state machine loop if the
902                //   the action signalled some kind of error, or
903                //   the action was to exit, occurs on normal end-of-rules-input.
904                break;
905            }
906
907            if (tableEl.fPushState != 0) {
908                fStackPtr++;
909                if (fStackPtr >= kStackSize) {
910                    System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
911                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
912                }
913                fStack[fStackPtr] = tableEl.fPushState;
914            }
915
916            if (tableEl.fNextChar) {
917                nextChar(fC);
918            }
919
920            // Get the next state from the table entry, or from the
921            //   state stack if the next state was specified as "pop".
922            if (tableEl.fNextState != 255) {
923                state = tableEl.fNextState;
924            } else {
925                state = fStack[fStackPtr];
926                fStackPtr--;
927                if (fStackPtr < 0) {
928                    System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
929                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
930                }
931            }
932
933        }
934
935        //
936        // If there were NO user specified reverse rules, set up the equivalent of ".*;"
937        //
938        if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
939            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
940            RBBINode operand = pushNewNode(RBBINode.setRef);
941            findSetFor(kAny, operand, null);
942            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
943            operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
944            fNodeStackPtr -= 2;
945        }
946
947        //
948        // Parsing of the input RBBI rules is complete.
949        // We now have a parse tree for the rule expressions
950        // and a list of all UnicodeSets that are referenced.
951        //
952        if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
953            fSymbolTable.rbbiSymtablePrint();
954        }
955        if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
956            System.out.println("Completed Forward Rules Parse Tree...");
957            fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
958            System.out.println("\nCompleted Reverse Rules Parse Tree...");
959            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
960            System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
961            if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
962                System.out.println("  -- null -- ");
963            } else {
964                fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
965            }
966            System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
967            if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
968                System.out.println("  -- null -- ");
969            } else {
970                fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
971            }
972        }
973    }
974
975    //---------------------------------------------------------------------------------
976    //
977    //  printNodeStack     for debugging...
978    //
979    //---------------------------------------------------------------------------------
980    ///CLOVER:OFF
981    void printNodeStack(String title) {
982        int i;
983        System.out.println(title + ".  Dumping node stack...\n");
984        for (i = fNodeStackPtr; i > 0; i--) {
985            fNodeStack[i].printTree(true);
986        }
987    }
988    ///CLOVER:ON
989
990    //---------------------------------------------------------------------------------
991    //
992    //  pushNewNode   create a new RBBINode of the specified type and push it
993    //                onto the stack of nodes.
994    //
995    //---------------------------------------------------------------------------------
996    RBBINode pushNewNode(int nodeType) {
997        fNodeStackPtr++;
998        if (fNodeStackPtr >= kStackSize) {
999            System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1000            error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1001        }
1002        fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1003        return fNodeStack[fNodeStackPtr];
1004    }
1005
1006    //---------------------------------------------------------------------------------
1007    //
1008    //  scanSet    Construct a UnicodeSet from the text at the current scan
1009    //             position.  Advance the scan position to the first character
1010    //             after the set.
1011    //
1012    //             A new RBBI setref node referring to the set is pushed onto the node
1013    //             stack.
1014    //
1015    //             The scan position is normally under the control of the state machine
1016    //             that controls rule parsing.  UnicodeSets, however, are parsed by
1017    //             the UnicodeSet constructor, not by the RBBI rule parser.
1018    //
1019    //---------------------------------------------------------------------------------
1020    void scanSet() {
1021        UnicodeSet uset = null;
1022        int startPos;
1023        ParsePosition pos = new ParsePosition(fScanIndex);
1024        int i;
1025
1026        startPos = fScanIndex;
1027        try {
1028            uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1029        } catch (Exception e) { // TODO:  catch fewer exception types.
1030            // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1031            error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1032        }
1033
1034        // Verify that the set contains at least one code point.
1035        //
1036        if (uset.isEmpty()) {
1037            // This set is empty.
1038            //  Make it an error, because it almost certainly is not what the user wanted.
1039            //  Also, avoids having to think about corner cases in the tree manipulation code
1040            //   that occurs later on.
1041            //  TODO:  this shouldn't be an error; it does happen.
1042            error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1043        }
1044
1045        // Advance the RBBI parse postion over the UnicodeSet pattern.
1046        //   Don't just set fScanIndex because the line/char positions maintained
1047        //   for error reporting would be thrown off.
1048        i = pos.getIndex();
1049        for (;;) {
1050            if (fNextIndex >= i) {
1051                break;
1052            }
1053            nextCharLL();
1054        }
1055
1056        RBBINode n;
1057
1058        n = pushNewNode(RBBINode.setRef);
1059        n.fFirstPos = startPos;
1060        n.fLastPos = fNextIndex;
1061        n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1062        //  findSetFor() serves several purposes here:
1063        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1064        //     - Mantains collection of all sets in use, needed later for establishing
1065        //          character categories for run time engine.
1066        //     - Eliminates mulitiple instances of the same set.
1067        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1068        findSetFor(n.fText, n, uset);
1069    }
1070
1071}
1072
1073