1
2//
3//  file:  rbbiscan.cpp
4//
5//  Copyright (C) 2002-2011, International Business Machines Corporation and others.
6//  All Rights Reserved.
7//
8//  This file contains the Rule Based Break Iterator Rule Builder functions for
9//   scanning the rules and assembling a parse tree.  This is the first phase
10//   of compiling the rules.
11//
12//  The overall of the rules is managed by class RBBIRuleBuilder, which will
13//  create and use an instance of this class as part of the process.
14//
15
16#include "unicode/utypes.h"
17
18#if !UCONFIG_NO_BREAK_ITERATION
19
20#include "unicode/unistr.h"
21#include "unicode/uniset.h"
22#include "unicode/uchar.h"
23#include "unicode/uchriter.h"
24#include "unicode/parsepos.h"
25#include "unicode/parseerr.h"
26#include "util.h"
27#include "cmemory.h"
28#include "cstring.h"
29
30#include "rbbirpt.h"   // Contains state table for the rbbi rules parser.
31                       //   generated by a Perl script.
32#include "rbbirb.h"
33#include "rbbinode.h"
34#include "rbbiscan.h"
35#include "rbbitblb.h"
36
37#include "uassert.h"
38
39
40//------------------------------------------------------------------------------
41//
42// Unicode Set init strings for each of the character classes needed for parsing a rule file.
43//               (Initialized with hex values for portability to EBCDIC based machines.
44//                Really ugly, but there's no good way to avoid it.)
45//
46//              The sets are referred to by name in the rbbirpt.txt, which is the
47//              source form of the state transition table for the RBBI rule parser.
48//
49//------------------------------------------------------------------------------
50static const UChar gRuleSet_rule_char_pattern[]       = {
51 //   [    ^      [    \     p     {      Z     }     \     u    0      0    2      0
52    0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
53 //   -    \      u    0     0     7      f     ]     -     [    \      p
54    0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
55 //   {     L     }    ]     -     [      \     p     {     N    }      ]     ]
56    0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
57
58static const UChar gRuleSet_name_char_pattern[]       = {
59//    [    _      \    p     {     L      }     \     p     {    N      }     ]
60    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
61
62static const UChar gRuleSet_digit_char_pattern[] = {
63//    [    0      -    9     ]
64    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
65
66static const UChar gRuleSet_name_start_char_pattern[] = {
67//    [    _      \    p     {     L      }     ]
68    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
69
70static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any"
71
72
73U_CDECL_BEGIN
74static void U_CALLCONV RBBISetTable_deleter(void *p) {
75    U_NAMESPACE_QUALIFIER RBBISetTableEl *px = (U_NAMESPACE_QUALIFIER RBBISetTableEl *)p;
76    delete px->key;
77    // Note:  px->val is owned by the linked list "fSetsListHead" in scanner.
78    //        Don't delete the value nodes here.
79    uprv_free(px);
80}
81U_CDECL_END
82
83U_NAMESPACE_BEGIN
84
85//------------------------------------------------------------------------------
86//
87//  Constructor.
88//
89//------------------------------------------------------------------------------
90RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
91{
92    fRB                 = rb;
93    fStackPtr           = 0;
94    fStack[fStackPtr]   = 0;
95    fNodeStackPtr       = 0;
96    fRuleNum            = 0;
97    fNodeStack[0]       = NULL;
98
99    fSymbolTable                            = NULL;
100    fSetTable                               = NULL;
101
102    fScanIndex = 0;
103    fNextIndex = 0;
104
105    fReverseRule        = FALSE;
106    fLookAheadRule      = FALSE;
107
108    fLineNum    = 1;
109    fCharNum    = 0;
110    fQuoteMode  = FALSE;
111
112    // Do not check status until after all critical fields are sufficiently initialized
113    //   that the destructor can run cleanly.
114    if (U_FAILURE(*rb->fStatus)) {
115        return;
116    }
117
118    //
119    //  Set up the constant Unicode Sets.
120    //     Note:  These could be made static, lazily initialized, and shared among
121    //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler,
122    //            and the time to build these few sets should be small compared to a
123    //            full break iterator build.
124    fRuleSets[kRuleSet_rule_char-128]       = UnicodeSet(gRuleSet_rule_char_pattern,       *rb->fStatus);
125    UnicodeSet *whitespaceSet = uprv_openPatternWhiteSpaceSet(rb->fStatus);
126    if (U_FAILURE(*rb->fStatus)) {
127        return;
128    }
129    fRuleSets[kRuleSet_white_space-128]     = *whitespaceSet;
130    delete whitespaceSet;
131    fRuleSets[kRuleSet_name_char-128]       = UnicodeSet(gRuleSet_name_char_pattern,       *rb->fStatus);
132    fRuleSets[kRuleSet_name_start_char-128] = UnicodeSet(gRuleSet_name_start_char_pattern, *rb->fStatus);
133    fRuleSets[kRuleSet_digit_char-128]      = UnicodeSet(gRuleSet_digit_char_pattern,      *rb->fStatus);
134    if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
135        // This case happens if ICU's data is missing.  UnicodeSet tries to look up property
136        //   names from the init string, can't find them, and claims an illegal arguement.
137        //   Change the error so that the actual problem will be clearer to users.
138        *rb->fStatus = U_BRK_INIT_ERROR;
139    }
140    if (U_FAILURE(*rb->fStatus)) {
141        return;
142    }
143
144    fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
145    if (fSymbolTable == NULL) {
146        *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
147        return;
148    }
149    fSetTable    = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);
150    if (U_FAILURE(*rb->fStatus)) {
151        return;
152    }
153    uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
154}
155
156
157
158//------------------------------------------------------------------------------
159//
160//  Destructor
161//
162//------------------------------------------------------------------------------
163RBBIRuleScanner::~RBBIRuleScanner() {
164    delete fSymbolTable;
165    if (fSetTable != NULL) {
166         uhash_close(fSetTable);
167         fSetTable = NULL;
168
169    }
170
171
172    // Node Stack.
173    //   Normally has one entry, which is the entire parse tree for the rules.
174    //   If errors occured, there may be additional subtrees left on the stack.
175    while (fNodeStackPtr > 0) {
176        delete fNodeStack[fNodeStackPtr];
177        fNodeStackPtr--;
178    }
179
180}
181
182//------------------------------------------------------------------------------
183//
184//  doParseAction        Do some action during rule parsing.
185//                       Called by the parse state machine.
186//                       Actions build the parse tree and Unicode Sets,
187//                       and maintain the parse stack for nested expressions.
188//
189//                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
190//                              They represent exactly the same thing.  They're separate
191//                              only to work around enum forward declaration restrictions
192//                              in some compilers, while at the same time avoiding multiple
193//                              definitions problems.  I'm sure that there's a better way.
194//
195//------------------------------------------------------------------------------
196UBool RBBIRuleScanner::doParseActions(int32_t action)
197{
198    RBBINode *n       = NULL;
199
200    UBool   returnVal = TRUE;
201
202    switch (action) {
203
204    case doExprStart:
205        pushNewNode(RBBINode::opStart);
206        fRuleNum++;
207        break;
208
209
210    case doExprOrOperator:
211        {
212            fixOpStack(RBBINode::precOpCat);
213            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
214            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
215            orNode->fLeftChild     = operandNode;
216            operandNode->fParent   = orNode;
217        }
218        break;
219
220    case doExprCatOperator:
221        // concatenation operator.
222        // For the implicit concatenation of adjacent terms in an expression that are
223        //   not separated by any other operator.  Action is invoked between the
224        //   actions for the two terms.
225        {
226            fixOpStack(RBBINode::precOpCat);
227            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
228            RBBINode  *catNode     = pushNewNode(RBBINode::opCat);
229            catNode->fLeftChild    = operandNode;
230            operandNode->fParent   = catNode;
231        }
232        break;
233
234    case doLParen:
235        // Open Paren.
236        //   The openParen node is a dummy operation type with a low precedence,
237        //     which has the affect of ensuring that any real binary op that
238        //     follows within the parens binds more tightly to the operands than
239        //     stuff outside of the parens.
240        pushNewNode(RBBINode::opLParen);
241        break;
242
243    case doExprRParen:
244        fixOpStack(RBBINode::precLParen);
245        break;
246
247    case doNOP:
248        break;
249
250    case doStartAssign:
251        // We've just scanned "$variable = "
252        // The top of the node stack has the $variable ref node.
253
254        // Save the start position of the RHS text in the StartExpression node
255        //   that precedes the $variableReference node on the stack.
256        //   This will eventually be used when saving the full $variable replacement
257        //   text as a string.
258        n = fNodeStack[fNodeStackPtr-1];
259        n->fFirstPos = fNextIndex;              // move past the '='
260
261        // Push a new start-of-expression node; needed to keep parse of the
262        //   RHS expression happy.
263        pushNewNode(RBBINode::opStart);
264        break;
265
266
267
268
269    case doEndAssign:
270        {
271            // We have reached the end of an assignement statement.
272            //   Current scan char is the ';' that terminates the assignment.
273
274            // Terminate expression, leaves expression parse tree rooted in TOS node.
275            fixOpStack(RBBINode::precStart);
276
277            RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2];
278            RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1];
279            RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr];
280
281            // Save original text of right side of assignment, excluding the terminating ';'
282            //  in the root of the node for the right-hand-side expression.
283            RHSExprNode->fFirstPos = startExprNode->fFirstPos;
284            RHSExprNode->fLastPos  = fScanIndex;
285            fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
286
287            // Expression parse tree becomes l. child of the $variable reference node.
288            varRefNode->fLeftChild = RHSExprNode;
289            RHSExprNode->fParent   = varRefNode;
290
291            // Make a symbol table entry for the $variableRef node.
292            fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
293            if (U_FAILURE(*fRB->fStatus)) {
294                // This is a round-about way to get the parse position set
295                //  so that duplicate symbols error messages include a line number.
296                UErrorCode t = *fRB->fStatus;
297                *fRB->fStatus = U_ZERO_ERROR;
298                error(t);
299            }
300
301            // Clean up the stack.
302            delete startExprNode;
303            fNodeStackPtr-=3;
304            break;
305        }
306
307    case doEndOfRule:
308        {
309        fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression
310        if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node.
311            break;
312        }
313#ifdef RBBI_DEBUG
314        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
315#endif
316        U_ASSERT(fNodeStackPtr == 1);
317
318        // If this rule includes a look-ahead '/', add a endMark node to the
319        //   expression tree.
320        if (fLookAheadRule) {
321            RBBINode  *thisRule       = fNodeStack[fNodeStackPtr];
322            RBBINode  *endNode        = pushNewNode(RBBINode::endMark);
323            RBBINode  *catNode        = pushNewNode(RBBINode::opCat);
324            fNodeStackPtr -= 2;
325            catNode->fLeftChild       = thisRule;
326            catNode->fRightChild      = endNode;
327            fNodeStack[fNodeStackPtr] = catNode;
328            endNode->fVal             = fRuleNum;
329            endNode->fLookAheadEnd    = TRUE;
330        }
331
332        // All rule expressions are ORed together.
333        // The ';' that terminates an expression really just functions as a '|' with
334        //   a low operator prededence.
335        //
336        // Each of the four sets of rules are collected separately.
337        //  (forward, reverse, safe_forward, safe_reverse)
338        //  OR this rule into the appropriate group of them.
339        //
340        RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
341
342        if (*destRules != NULL) {
343            // This is not the first rule encounted.
344            // OR previous stuff  (from *destRules)
345            // with the current rule expression (on the Node Stack)
346            //  with the resulting OR expression going to *destRules
347            //
348            RBBINode  *thisRule    = fNodeStack[fNodeStackPtr];
349            RBBINode  *prevRules   = *destRules;
350            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
351            orNode->fLeftChild     = prevRules;
352            prevRules->fParent     = orNode;
353            orNode->fRightChild    = thisRule;
354            thisRule->fParent      = orNode;
355            *destRules             = orNode;
356        }
357        else
358        {
359            // This is the first rule encountered (for this direction).
360            // Just move its parse tree from the stack to *destRules.
361            *destRules = fNodeStack[fNodeStackPtr];
362        }
363        fReverseRule   = FALSE;   // in preparation for the next rule.
364        fLookAheadRule = FALSE;
365        fNodeStackPtr  = 0;
366        }
367        break;
368
369
370    case doRuleError:
371        error(U_BRK_RULE_SYNTAX);
372        returnVal = FALSE;
373        break;
374
375
376    case doVariableNameExpectedErr:
377        error(U_BRK_RULE_SYNTAX);
378        break;
379
380
381    //
382    //  Unary operands  + ? *
383    //    These all appear after the operand to which they apply.
384    //    When we hit one, the operand (may be a whole sub expression)
385    //    will be on the top of the stack.
386    //    Unary Operator becomes TOS, with the old TOS as its one child.
387    case doUnaryOpPlus:
388        {
389            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
390            RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus);
391            plusNode->fLeftChild   = operandNode;
392            operandNode->fParent   = plusNode;
393        }
394        break;
395
396    case doUnaryOpQuestion:
397        {
398            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
399            RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion);
400            qNode->fLeftChild      = operandNode;
401            operandNode->fParent   = qNode;
402        }
403        break;
404
405    case doUnaryOpStar:
406        {
407            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
408            RBBINode  *starNode    = pushNewNode(RBBINode::opStar);
409            starNode->fLeftChild   = operandNode;
410            operandNode->fParent   = starNode;
411        }
412        break;
413
414    case doRuleChar:
415        // A "Rule Character" is any single character that is a literal part
416        // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
417        // These are pretty uncommon in break rules; the terms are more commonly
418        //  sets.  To keep things uniform, treat these characters like as
419        // sets that just happen to contain only one character.
420        {
421            n = pushNewNode(RBBINode::setRef);
422            findSetFor(fC.fChar, n);
423            n->fFirstPos = fScanIndex;
424            n->fLastPos  = fNextIndex;
425            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
426            break;
427        }
428
429    case doDotAny:
430        // scanned a ".", meaning match any single character.
431        {
432            n = pushNewNode(RBBINode::setRef);
433            findSetFor(kAny, n);
434            n->fFirstPos = fScanIndex;
435            n->fLastPos  = fNextIndex;
436            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
437            break;
438        }
439
440    case doSlash:
441        // Scanned a '/', which identifies a look-ahead break position in a rule.
442        n = pushNewNode(RBBINode::lookAhead);
443        n->fVal      = fRuleNum;
444        n->fFirstPos = fScanIndex;
445        n->fLastPos  = fNextIndex;
446        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
447        fLookAheadRule = TRUE;
448        break;
449
450
451    case doStartTagValue:
452        // Scanned a '{', the opening delimiter for a tag value within a rule.
453        n = pushNewNode(RBBINode::tag);
454        n->fVal      = 0;
455        n->fFirstPos = fScanIndex;
456        n->fLastPos  = fNextIndex;
457        break;
458
459    case doTagDigit:
460        // Just scanned a decimal digit that's part of a tag value
461        {
462            n = fNodeStack[fNodeStackPtr];
463            uint32_t v = u_charDigitValue(fC.fChar);
464            U_ASSERT(v < 10);
465            n->fVal = n->fVal*10 + v;
466            break;
467        }
468
469    case doTagValue:
470        n = fNodeStack[fNodeStackPtr];
471        n->fLastPos = fNextIndex;
472        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
473        break;
474
475    case doTagExpectedError:
476        error(U_BRK_MALFORMED_RULE_TAG);
477        returnVal = FALSE;
478        break;
479
480    case doOptionStart:
481        // Scanning a !!option.   At the start of string.
482        fOptionStart = fScanIndex;
483        break;
484
485    case doOptionEnd:
486        {
487            UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
488            if (opt == UNICODE_STRING("chain", 5)) {
489                fRB->fChainRules = TRUE;
490            } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
491                fRB->fLBCMNoChain = TRUE;
492            } else if (opt == UNICODE_STRING("forward", 7)) {
493                fRB->fDefaultTree   = &fRB->fForwardTree;
494            } else if (opt == UNICODE_STRING("reverse", 7)) {
495                fRB->fDefaultTree   = &fRB->fReverseTree;
496            } else if (opt == UNICODE_STRING("safe_forward", 12)) {
497                fRB->fDefaultTree   = &fRB->fSafeFwdTree;
498            } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
499                fRB->fDefaultTree   = &fRB->fSafeRevTree;
500            } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
501                fRB->fLookAheadHardBreak = TRUE;
502            } else {
503                error(U_BRK_UNRECOGNIZED_OPTION);
504            }
505        }
506        break;
507
508    case doReverseDir:
509        fReverseRule = TRUE;
510        break;
511
512    case doStartVariableName:
513        n = pushNewNode(RBBINode::varRef);
514        if (U_FAILURE(*fRB->fStatus)) {
515            break;
516        }
517        n->fFirstPos = fScanIndex;
518        break;
519
520    case doEndVariableName:
521        n = fNodeStack[fNodeStackPtr];
522        if (n==NULL || n->fType != RBBINode::varRef) {
523            error(U_BRK_INTERNAL_ERROR);
524            break;
525        }
526        n->fLastPos = fScanIndex;
527        fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
528        // Look the newly scanned name up in the symbol table
529        //   If there's an entry, set the l. child of the var ref to the replacement expression.
530        //   (We also pass through here when scanning assignments, but no harm is done, other
531        //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
532        n->fLeftChild = fSymbolTable->lookupNode(n->fText);
533        break;
534
535    case doCheckVarDef:
536        n = fNodeStack[fNodeStackPtr];
537        if (n->fLeftChild == NULL) {
538            error(U_BRK_UNDEFINED_VARIABLE);
539            returnVal = FALSE;
540        }
541        break;
542
543    case doExprFinished:
544        break;
545
546    case doRuleErrorAssignExpr:
547        error(U_BRK_ASSIGN_ERROR);
548        returnVal = FALSE;
549        break;
550
551    case doExit:
552        returnVal = FALSE;
553        break;
554
555    case doScanUnicodeSet:
556        scanSet();
557        break;
558
559    default:
560        error(U_BRK_INTERNAL_ERROR);
561        returnVal = FALSE;
562        break;
563    }
564    return returnVal;
565}
566
567
568
569
570//------------------------------------------------------------------------------
571//
572//  Error         Report a rule parse error.
573//                Only report it if no previous error has been recorded.
574//
575//------------------------------------------------------------------------------
576void RBBIRuleScanner::error(UErrorCode e) {
577    if (U_SUCCESS(*fRB->fStatus)) {
578        *fRB->fStatus = e;
579        if (fRB->fParseError) {
580            fRB->fParseError->line  = fLineNum;
581            fRB->fParseError->offset = fCharNum;
582            fRB->fParseError->preContext[0] = 0;
583            fRB->fParseError->preContext[0] = 0;
584        }
585    }
586}
587
588
589
590
591//------------------------------------------------------------------------------
592//
593//  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
594//               An entry on the stack may be as small as a single setRef node,
595//               or as large as the parse tree
596//               for an entire expression (this will be the one item left on the stack
597//               when the parsing of an RBBI rule completes.
598//
599//               This function is called when a binary operator is encountered.
600//               It looks back up the stack for operators that are not yet associated
601//               with a right operand, and if the precedence of the stacked operator >=
602//               the precedence of the current operator, binds the operand left,
603//               to the previously encountered operator.
604//
605//------------------------------------------------------------------------------
606void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
607    RBBINode *n;
608    // printNodeStack("entering fixOpStack()");
609    for (;;) {
610        n = fNodeStack[fNodeStackPtr-1];   // an operator node
611        if (n->fPrecedence == 0) {
612            RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
613            error(U_BRK_INTERNAL_ERROR);
614            return;
615        }
616
617        if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
618            // The most recent operand goes with the current operator,
619            //   not with the previously stacked one.
620            break;
621        }
622            // Stack operator is a binary op  ( '|' or concatenation)
623            //   TOS operand becomes right child of this operator.
624            //   Resulting subexpression becomes the TOS operand.
625            n->fRightChild = fNodeStack[fNodeStackPtr];
626            fNodeStack[fNodeStackPtr]->fParent = n;
627            fNodeStackPtr--;
628        // printNodeStack("looping in fixOpStack()   ");
629    }
630
631    if (p <= RBBINode::precLParen) {
632        // Scan is at a right paren or end of expression.
633        //  The scanned item must match the stack, or else there was an error.
634        //  Discard the left paren (or start expr) node from the stack,
635            //  leaving the completed (sub)expression as TOS.
636            if (n->fPrecedence != p) {
637                // Right paren encountered matched start of expression node, or
638                // end of expression matched with a left paren node.
639                error(U_BRK_MISMATCHED_PAREN);
640            }
641            fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
642            fNodeStackPtr--;
643            // Delete the now-discarded LParen or Start node.
644            delete n;
645    }
646    // printNodeStack("leaving fixOpStack()");
647}
648
649
650
651
652//------------------------------------------------------------------------------
653//
654//   findSetFor    given a UnicodeString,
655//                  - find the corresponding Unicode Set  (uset node)
656//                         (create one if necessary)
657//                  - Set fLeftChild of the caller's node (should be a setRef node)
658//                         to the uset node
659//                 Maintain a hash table of uset nodes, so the same one is always used
660//                    for the same string.
661//                 If a "to adopt" set is provided and we haven't seen this key before,
662//                    add the provided set to the hash table.
663//                 If the string is one (32 bit) char in length, the set contains
664//                    just one element which is the char in question.
665//                 If the string is "any", return a set containing all chars.
666//
667//------------------------------------------------------------------------------
668void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
669
670    RBBISetTableEl   *el;
671
672    // First check whether we've already cached a set for this string.
673    // If so, just use the cached set in the new node.
674    //   delete any set provided by the caller, since we own it.
675    el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
676    if (el != NULL) {
677        delete setToAdopt;
678        node->fLeftChild = el->val;
679        U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
680        return;
681    }
682
683    // Haven't seen this set before.
684    // If the caller didn't provide us with a prebuilt set,
685    //   create a new UnicodeSet now.
686    if (setToAdopt == NULL) {
687        if (s.compare(kAny, -1) == 0) {
688            setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
689        } else {
690            UChar32 c;
691            c = s.char32At(0);
692            setToAdopt = new UnicodeSet(c, c);
693        }
694    }
695
696    //
697    // Make a new uset node to refer to this UnicodeSet
698    // This new uset node becomes the child of the caller's setReference node.
699    //
700    RBBINode *usetNode    = new RBBINode(RBBINode::uset);
701    if (usetNode == NULL) {
702        error(U_MEMORY_ALLOCATION_ERROR);
703        return;
704    }
705    usetNode->fInputSet   = setToAdopt;
706    usetNode->fParent     = node;
707    node->fLeftChild      = usetNode;
708    usetNode->fText = s;
709
710
711    //
712    // Add the new uset node to the list of all uset nodes.
713    //
714    fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
715
716
717    //
718    // Add the new set to the set hash table.
719    //
720    el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
721    UnicodeString *tkey = new UnicodeString(s);
722    if (tkey == NULL || el == NULL || setToAdopt == NULL) {
723        // Delete to avoid memory leak
724        delete tkey;
725        tkey = NULL;
726        uprv_free(el);
727        el = NULL;
728        delete setToAdopt;
729        setToAdopt = NULL;
730
731        error(U_MEMORY_ALLOCATION_ERROR);
732        return;
733    }
734    el->key = tkey;
735    el->val = usetNode;
736    uhash_put(fSetTable, el->key, el, fRB->fStatus);
737
738    return;
739}
740
741
742
743//
744//  Assorted Unicode character constants.
745//     Numeric because there is no portable way to enter them as literals.
746//     (Think EBCDIC).
747//
748static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
749static const UChar      chLF        = 0x0a;
750static const UChar      chNEL       = 0x85;      //    NEL newline variant
751static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
752static const UChar      chApos      = 0x27;      //  single quote, for quoted chars.
753static const UChar      chPound     = 0x23;      // '#', introduces a comment.
754static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
755static const UChar      chLParen    = 0x28;
756static const UChar      chRParen    = 0x29;
757
758
759//------------------------------------------------------------------------------
760//
761//  stripRules    Return a rules string without unnecessary
762//                characters.
763//
764//------------------------------------------------------------------------------
765UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
766    UnicodeString strippedRules;
767    int rulesLength = rules.length();
768    for (int idx = 0; idx < rulesLength; ) {
769        UChar ch = rules[idx++];
770        if (ch == chPound) {
771            while (idx < rulesLength
772                && ch != chCR && ch != chLF && ch != chNEL)
773            {
774                ch = rules[idx++];
775            }
776        }
777        if (!u_isISOControl(ch)) {
778            strippedRules.append(ch);
779        }
780    }
781    // strippedRules = strippedRules.unescape();
782    return strippedRules;
783}
784
785
786//------------------------------------------------------------------------------
787//
788//  nextCharLL    Low Level Next Char from rule input source.
789//                Get a char from the input character iterator,
790//                keep track of input position for error reporting.
791//
792//------------------------------------------------------------------------------
793UChar32  RBBIRuleScanner::nextCharLL() {
794    UChar32  ch;
795
796    if (fNextIndex >= fRB->fRules.length()) {
797        return (UChar32)-1;
798    }
799    ch         = fRB->fRules.char32At(fNextIndex);
800    fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
801
802    if (ch == chCR ||
803        ch == chNEL ||
804        ch == chLS   ||
805        (ch == chLF && fLastChar != chCR)) {
806        // Character is starting a new line.  Bump up the line number, and
807        //  reset the column to 0.
808        fLineNum++;
809        fCharNum=0;
810        if (fQuoteMode) {
811            error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
812            fQuoteMode = FALSE;
813        }
814    }
815    else {
816        // Character is not starting a new line.  Except in the case of a
817        //   LF following a CR, increment the column position.
818        if (ch != chLF) {
819            fCharNum++;
820        }
821    }
822    fLastChar = ch;
823    return ch;
824}
825
826
827//------------------------------------------------------------------------------
828//
829//   nextChar     for rules scanning.  At this level, we handle stripping
830//                out comments and processing backslash character escapes.
831//                The rest of the rules grammar is handled at the next level up.
832//
833//------------------------------------------------------------------------------
834void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
835
836    // Unicode Character constants needed for the processing done by nextChar(),
837    //   in hex because literals wont work on EBCDIC machines.
838
839    fScanIndex = fNextIndex;
840    c.fChar    = nextCharLL();
841    c.fEscaped = FALSE;
842
843    //
844    //  check for '' sequence.
845    //  These are recognized in all contexts, whether in quoted text or not.
846    //
847    if (c.fChar == chApos) {
848        if (fRB->fRules.char32At(fNextIndex) == chApos) {
849            c.fChar    = nextCharLL();        // get nextChar officially so character counts
850            c.fEscaped = TRUE;                //   stay correct.
851        }
852        else
853        {
854            // Single quote, by itself.
855            //   Toggle quoting mode.
856            //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
857            fQuoteMode = !fQuoteMode;
858            if (fQuoteMode == TRUE) {
859                c.fChar = chLParen;
860            } else {
861                c.fChar = chRParen;
862            }
863            c.fEscaped = FALSE;      // The paren that we return is not escaped.
864            return;
865        }
866    }
867
868    if (fQuoteMode) {
869        c.fEscaped = TRUE;
870    }
871    else
872    {
873        // We are not in a 'quoted region' of the source.
874        //
875        if (c.fChar == chPound) {
876            // Start of a comment.  Consume the rest of it.
877            //  The new-line char that terminates the comment is always returned.
878            //  It will be treated as white-space, and serves to break up anything
879            //    that might otherwise incorrectly clump together with a comment in
880            //    the middle (a variable name, for example.)
881            for (;;) {
882                c.fChar = nextCharLL();
883                if (c.fChar == (UChar32)-1 ||  // EOF
884                    c.fChar == chCR     ||
885                    c.fChar == chLF     ||
886                    c.fChar == chNEL    ||
887                    c.fChar == chLS)       {break;}
888            }
889        }
890        if (c.fChar == (UChar32)-1) {
891            return;
892        }
893
894        //
895        //  check for backslash escaped characters.
896        //  Use UnicodeString::unescapeAt() to handle them.
897        //
898        if (c.fChar == chBackSlash) {
899            c.fEscaped = TRUE;
900            int32_t startX = fNextIndex;
901            c.fChar = fRB->fRules.unescapeAt(fNextIndex);
902            if (fNextIndex == startX) {
903                error(U_BRK_HEX_DIGITS_EXPECTED);
904            }
905            fCharNum += fNextIndex-startX;
906        }
907    }
908    // putc(c.fChar, stdout);
909}
910
911//------------------------------------------------------------------------------
912//
913//  Parse RBBI rules.   The state machine for rules parsing is here.
914//                      The state tables are hand-written in the file rbbirpt.txt,
915//                      and converted to the form used here by a perl
916//                      script rbbicst.pl
917//
918//------------------------------------------------------------------------------
919void RBBIRuleScanner::parse() {
920    uint16_t                state;
921    const RBBIRuleTableEl  *tableEl;
922
923    if (U_FAILURE(*fRB->fStatus)) {
924        return;
925    }
926
927    state = 1;
928    nextChar(fC);
929    //
930    // Main loop for the rule parsing state machine.
931    //   Runs once per state transition.
932    //   Each time through optionally performs, depending on the state table,
933    //      - an advance to the the next input char
934    //      - an action to be performed.
935    //      - pushing or popping a state to/from the local state return stack.
936    //
937    for (;;) {
938        //  Bail out if anything has gone wrong.
939        //  RBBI rule file parsing stops on the first error encountered.
940        if (U_FAILURE(*fRB->fStatus)) {
941            break;
942        }
943
944        // Quit if state == 0.  This is the normal way to exit the state machine.
945        //
946        if (state == 0) {
947            break;
948        }
949
950        // Find the state table element that matches the input char from the rule, or the
951        //    class of the input character.  Start with the first table row for this
952        //    state, then linearly scan forward until we find a row that matches the
953        //    character.  The last row for each state always matches all characters, so
954        //    the search will stop there, if not before.
955        //
956        tableEl = &gRuleParseStateTable[state];
957        #ifdef RBBI_DEBUG
958            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
959                RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
960                    fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
961            }
962        #endif
963
964        for (;;) {
965            #ifdef RBBI_DEBUG
966                if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
967            #endif
968            if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) {
969                // Table row specified an individual character, not a set, and
970                //   the input character is not escaped, and
971                //   the input character matched it.
972                break;
973            }
974            if (tableEl->fCharClass == 255) {
975                // Table row specified default, match anything character class.
976                break;
977            }
978            if (tableEl->fCharClass == 254 && fC.fEscaped)  {
979                // Table row specified "escaped" and the char was escaped.
980                break;
981            }
982            if (tableEl->fCharClass == 253 && fC.fEscaped &&
983                (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
984                // Table row specified "escaped P" and the char is either 'p' or 'P'.
985                break;
986            }
987            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {
988                // Table row specified eof and we hit eof on the input.
989                break;
990            }
991
992            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
993                fC.fEscaped == FALSE &&                                      //   char is not escaped &&
994                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
995                if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
996                    // Table row specified a character class, or set of characters,
997                    //   and the current char matches it.
998                    break;
999                }
1000            }
1001
1002            // No match on this row, advance to the next  row for this state,
1003            tableEl++;
1004        }
1005        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
1006
1007        //
1008        // We've found the row of the state table that matches the current input
1009        //   character from the rules string.
1010        // Perform any action specified  by this row in the state table.
1011        if (doParseActions((int32_t)tableEl->fAction) == FALSE) {
1012            // Break out of the state machine loop if the
1013            //   the action signalled some kind of error, or
1014            //   the action was to exit, occurs on normal end-of-rules-input.
1015            break;
1016        }
1017
1018        if (tableEl->fPushState != 0) {
1019            fStackPtr++;
1020            if (fStackPtr >= kStackSize) {
1021                error(U_BRK_INTERNAL_ERROR);
1022                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
1023                fStackPtr--;
1024            }
1025            fStack[fStackPtr] = tableEl->fPushState;
1026        }
1027
1028        if (tableEl->fNextChar) {
1029            nextChar(fC);
1030        }
1031
1032        // Get the next state from the table entry, or from the
1033        //   state stack if the next state was specified as "pop".
1034        if (tableEl->fNextState != 255) {
1035            state = tableEl->fNextState;
1036        } else {
1037            state = fStack[fStackPtr];
1038            fStackPtr--;
1039            if (fStackPtr < 0) {
1040                error(U_BRK_INTERNAL_ERROR);
1041                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
1042                fStackPtr++;
1043            }
1044        }
1045
1046    }
1047
1048    //
1049    // If there were NO user specified reverse rules, set up the equivalent of ".*;"
1050    //
1051    if (fRB->fReverseTree == NULL) {
1052        fRB->fReverseTree  = pushNewNode(RBBINode::opStar);
1053        RBBINode  *operand = pushNewNode(RBBINode::setRef);
1054        findSetFor(kAny, operand);
1055        fRB->fReverseTree->fLeftChild = operand;
1056        operand->fParent              = fRB->fReverseTree;
1057        fNodeStackPtr -= 2;
1058    }
1059
1060
1061    //
1062    // Parsing of the input RBBI rules is complete.
1063    // We now have a parse tree for the rule expressions
1064    // and a list of all UnicodeSets that are referenced.
1065    //
1066#ifdef RBBI_DEBUG
1067    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
1068    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
1069    {
1070        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
1071        fRB->fForwardTree->printTree(TRUE);
1072        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
1073        fRB->fReverseTree->printTree(TRUE);
1074        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
1075        fRB->fSafeFwdTree->printTree(TRUE);
1076        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
1077        fRB->fSafeRevTree->printTree(TRUE);
1078    }
1079#endif
1080}
1081
1082
1083//------------------------------------------------------------------------------
1084//
1085//  printNodeStack     for debugging...
1086//
1087//------------------------------------------------------------------------------
1088#ifdef RBBI_DEBUG
1089void RBBIRuleScanner::printNodeStack(const char *title) {
1090    int i;
1091    RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
1092    for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}
1093}
1094#endif
1095
1096
1097
1098
1099//------------------------------------------------------------------------------
1100//
1101//  pushNewNode   create a new RBBINode of the specified type and push it
1102//                onto the stack of nodes.
1103//
1104//------------------------------------------------------------------------------
1105RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) {
1106    fNodeStackPtr++;
1107    if (fNodeStackPtr >= kStackSize) {
1108        error(U_BRK_INTERNAL_ERROR);
1109        RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
1110        *fRB->fStatus = U_BRK_INTERNAL_ERROR;
1111        return NULL;
1112    }
1113    fNodeStack[fNodeStackPtr] = new RBBINode(t);
1114    if (fNodeStack[fNodeStackPtr] == NULL) {
1115        *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
1116    }
1117    return fNodeStack[fNodeStackPtr];
1118}
1119
1120
1121
1122//------------------------------------------------------------------------------
1123//
1124//  scanSet    Construct a UnicodeSet from the text at the current scan
1125//             position.  Advance the scan position to the first character
1126//             after the set.
1127//
1128//             A new RBBI setref node referring to the set is pushed onto the node
1129//             stack.
1130//
1131//             The scan position is normally under the control of the state machine
1132//             that controls rule parsing.  UnicodeSets, however, are parsed by
1133//             the UnicodeSet constructor, not by the RBBI rule parser.
1134//
1135//------------------------------------------------------------------------------
1136void RBBIRuleScanner::scanSet() {
1137    UnicodeSet    *uset;
1138    ParsePosition  pos;
1139    int            startPos;
1140    int            i;
1141
1142    if (U_FAILURE(*fRB->fStatus)) {
1143        return;
1144    }
1145
1146    pos.setIndex(fScanIndex);
1147    startPos = fScanIndex;
1148    UErrorCode localStatus = U_ZERO_ERROR;
1149    uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE,
1150                         fSymbolTable,
1151                         localStatus);
1152    if (uset == NULL) {
1153        localStatus = U_MEMORY_ALLOCATION_ERROR;
1154    }
1155    if (U_FAILURE(localStatus)) {
1156        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
1157        //         UnicodeSet appears to not be reporting correctly at this time.
1158        #ifdef RBBI_DEBUG
1159            RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
1160        #endif
1161        error(localStatus);
1162        delete uset;
1163        return;
1164    }
1165
1166    // Verify that the set contains at least one code point.
1167    //
1168    if (uset->isEmpty()) {
1169        // This set is empty.
1170        //  Make it an error, because it almost certainly is not what the user wanted.
1171        //  Also, avoids having to think about corner cases in the tree manipulation code
1172        //   that occurs later on.
1173        error(U_BRK_RULE_EMPTY_SET);
1174        delete uset;
1175        return;
1176    }
1177
1178
1179    // Advance the RBBI parse postion over the UnicodeSet pattern.
1180    //   Don't just set fScanIndex because the line/char positions maintained
1181    //   for error reporting would be thrown off.
1182    i = pos.getIndex();
1183    for (;;) {
1184        if (fNextIndex >= i) {
1185            break;
1186        }
1187        nextCharLL();
1188    }
1189
1190    if (U_SUCCESS(*fRB->fStatus)) {
1191        RBBINode         *n;
1192
1193        n = pushNewNode(RBBINode::setRef);
1194        n->fFirstPos = startPos;
1195        n->fLastPos  = fNextIndex;
1196        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
1197        //  findSetFor() serves several purposes here:
1198        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1199        //     - Mantains collection of all sets in use, needed later for establishing
1200        //          character categories for run time engine.
1201        //     - Eliminates mulitiple instances of the same set.
1202        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1203        findSetFor(n->fText, n, uset);
1204    }
1205
1206}
1207
1208U_NAMESPACE_END
1209
1210#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1211