/* * [The "BSD license"] * Copyright (c) 2010 Terence Parr * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package org.antlr.tool; import org.antlr.analysis.*; import org.antlr.misc.IntSet; import org.antlr.misc.IntervalSet; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.List; /** Routines to construct StateClusters from EBNF grammar constructs. * No optimization is done to remove unnecessary epsilon edges. * * TODO: add an optimization that reduces number of states and transitions * will help with speed of conversion and make it easier to view NFA. For * example, o-A->o-->o-B->o should be o-A->o-B->o */ public class NFAFactory { /** This factory is attached to a specifc NFA that it is building. * The NFA will be filled up with states and transitions. */ NFA nfa = null; public Rule getCurrentRule() { return currentRule; } public void setCurrentRule(Rule currentRule) { this.currentRule = currentRule; } Rule currentRule = null; public NFAFactory(NFA nfa) { nfa.setFactory(this); this.nfa = nfa; } public NFAState newState() { NFAState n = new NFAState(nfa); int state = nfa.getNewNFAStateNumber(); n.stateNumber = state; nfa.addState(n); n.enclosingRule = currentRule; return n; } /** Optimize an alternative (list of grammar elements). * * Walk the chain of elements (which can be complicated loop blocks...) * and throw away any epsilon transitions used to link up simple elements. * * This only removes 195 states from the java.g's NFA, but every little * bit helps. Perhaps I can improve in the future. */ public void optimizeAlternative(StateCluster alt) { NFAState s = alt.left; while ( s!=alt.right ) { // if it's a block element, jump over it and continue if ( s.endOfBlockStateNumber!=State.INVALID_STATE_NUMBER ) { s = nfa.getState(s.endOfBlockStateNumber); continue; } Transition t = s.transition[0]; if ( t instanceof RuleClosureTransition ) { s = ((RuleClosureTransition) t).followState; continue; } if ( t.label.isEpsilon() && !t.label.isAction() && s.getNumberOfTransitions()==1 ) { // bypass epsilon transition and point to what the epsilon's // target points to unless that epsilon transition points to // a block or loop etc.. Also don't collapse epsilons that // point at the last node of the alt. Don't collapse action edges NFAState epsilonTarget = (NFAState)t.target; if ( epsilonTarget.endOfBlockStateNumber==State.INVALID_STATE_NUMBER && epsilonTarget.transition[0] !=null ) { s.setTransition0(epsilonTarget.transition[0]); /* System.out.println("### opt "+s.stateNumber+"->"+ epsilonTarget.transition(0).target.stateNumber); */ } } s = (NFAState)t.target; } } /** From label A build Graph o-A->o */ public StateCluster build_Atom(int label, GrammarAST associatedAST) { NFAState left = newState(); NFAState right = newState(); left.associatedASTNode = associatedAST; right.associatedASTNode = associatedAST; transitionBetweenStates(left, right, label); StateCluster g = new StateCluster(left, right); return g; } public StateCluster build_Atom(GrammarAST atomAST) { int tokenType = nfa.grammar.getTokenType(atomAST.getText()); return build_Atom(tokenType, atomAST); } /** From set build single edge graph o->o-set->o. To conform to * what an alt block looks like, must have extra state on left. */ public StateCluster build_Set(IntSet set, GrammarAST associatedAST) { NFAState left = newState(); NFAState right = newState(); left.associatedASTNode = associatedAST; right.associatedASTNode = associatedAST; Label label = new Label(set); Transition e = new Transition(label,right); left.addTransition(e); StateCluster g = new StateCluster(left, right); return g; } /** Can only complement block of simple alts; can complement build_Set() * result, that is. Get set and complement, replace old with complement. public StateCluster build_AlternativeBlockComplement(StateCluster blk) { State s0 = blk.left; IntSet set = getCollapsedBlockAsSet(s0); if ( set!=null ) { // if set is available, then structure known and blk is a set set = nfa.grammar.complement(set); Label label = s0.transition(0).target.transition(0).label; label.setSet(set); } return blk; } */ public StateCluster build_Range(int a, int b) { NFAState left = newState(); NFAState right = newState(); Label label = new Label(IntervalSet.of(a, b)); Transition e = new Transition(label,right); left.addTransition(e); StateCluster g = new StateCluster(left, right); return g; } /** From char 'c' build StateCluster o-intValue(c)->o */ public StateCluster build_CharLiteralAtom(GrammarAST charLiteralAST) { int c = Grammar.getCharValueFromGrammarCharLiteral(charLiteralAST.getText()); return build_Atom(c, charLiteralAST); } /** From char 'c' build StateCluster o-intValue(c)->o * can include unicode spec likes '\u0024' later. Accepts * actual unicode 16-bit now, of course, by default. * TODO not supplemental char clean! */ public StateCluster build_CharRange(String a, String b) { int from = Grammar.getCharValueFromGrammarCharLiteral(a); int to = Grammar.getCharValueFromGrammarCharLiteral(b); return build_Range(from, to); } /** For a non-lexer, just build a simple token reference atom. * For a lexer, a string is a sequence of char to match. That is, * "fog" is treated as 'f' 'o' 'g' not as a single transition in * the DFA. Machine== o-'f'->o-'o'->o-'g'->o and has n+1 states * for n characters. */ public StateCluster build_StringLiteralAtom(GrammarAST stringLiteralAST) { if ( nfa.grammar.type==Grammar.LEXER ) { StringBuffer chars = Grammar.getUnescapedStringFromGrammarStringLiteral(stringLiteralAST.getText()); NFAState first = newState(); NFAState last = null; NFAState prev = first; for (int i=0; i(r) o * * where (r) is the start of rule r and the trailing o is not linked * to from rule ref state directly (it's done thru the transition(0) * RuleClosureTransition. * * If the rule r is just a list of tokens, it's block will be just * a set on an edge o->o->o-set->o->o->o, could inline it rather than doing * the rule reference, but i'm not doing this yet as I'm not sure * it would help much in the NFA->DFA construction. * * TODO add to codegen: collapse alt blks that are sets into single matchSet */ public StateCluster build_RuleRef(Rule refDef, NFAState ruleStart) { //System.out.println("building ref to rule "+nfa.grammar.name+"."+refDef.name); NFAState left = newState(); // left.setDescription("ref to "+ruleStart.getDescription()); NFAState right = newState(); // right.setDescription("NFAState following ref to "+ruleStart.getDescription()); Transition e = new RuleClosureTransition(refDef,ruleStart,right); left.addTransition(e); StateCluster g = new StateCluster(left, right); return g; } /** From an empty alternative build StateCluster o-e->o */ public StateCluster build_Epsilon() { NFAState left = newState(); NFAState right = newState(); transitionBetweenStates(left, right, Label.EPSILON); StateCluster g = new StateCluster(left, right); return g; } /** Build what amounts to an epsilon transition with a semantic * predicate action. The pred is a pointer into the AST of * the SEMPRED token. */ public StateCluster build_SemanticPredicate(GrammarAST pred) { // don't count syn preds if ( !pred.getText().toUpperCase() .startsWith(Grammar.SYNPRED_RULE_PREFIX.toUpperCase()) ) { nfa.grammar.numberOfSemanticPredicates++; } NFAState left = newState(); NFAState right = newState(); Transition e = new Transition(new PredicateLabel(pred), right); left.addTransition(e); StateCluster g = new StateCluster(left, right); return g; } /** Build what amounts to an epsilon transition with an action. * The action goes into NFA though it is ignored during analysis. * It slows things down a bit, but I must ignore predicates after * having seen an action (5-5-2008). */ public StateCluster build_Action(GrammarAST action) { NFAState left = newState(); NFAState right = newState(); Transition e = new Transition(new ActionLabel(action), right); left.addTransition(e); return new StateCluster(left, right); } /** add an EOF transition to any rule end NFAState that points to nothing * (i.e., for all those rules not invoked by another rule). These * are start symbols then. * * Return the number of grammar entry points; i.e., how many rules are * not invoked by another rule (they can only be invoked from outside). * These are the start rules. */ public int build_EOFStates(Collection rules) { int numberUnInvokedRules = 0; for (Iterator iterator = rules.iterator(); iterator.hasNext();) { Rule r = (Rule) iterator.next(); NFAState endNFAState = r.stopState; // Is this rule a start symbol? (no follow links) if ( endNFAState.transition[0] ==null ) { // if so, then don't let algorithm fall off the end of // the rule, make it hit EOF/EOT. build_EOFState(endNFAState); // track how many rules have been invoked by another rule numberUnInvokedRules++; } } return numberUnInvokedRules; } /** set up an NFA NFAState that will yield eof tokens or, * in the case of a lexer grammar, an EOT token when the conversion * hits the end of a rule. */ private void build_EOFState(NFAState endNFAState) { NFAState end = newState(); int label = Label.EOF; if ( nfa.grammar.type==Grammar.LEXER ) { label = Label.EOT; end.setEOTTargetState(true); } /* System.out.println("build "+nfa.grammar.getTokenDisplayName(label)+ " loop on end of state "+endNFAState.getDescription()+ " to state "+end.stateNumber); */ Transition toEnd = new Transition(label, end); endNFAState.addTransition(toEnd); } /** From A B build A-e->B (that is, build an epsilon arc from right * of A to left of B). * * As a convenience, return B if A is null or return A if B is null. */ public StateCluster build_AB(StateCluster A, StateCluster B) { if ( A==null ) { return B; } if ( B==null ) { return A; } transitionBetweenStates(A.right, B.left, Label.EPSILON); StateCluster g = new StateCluster(A.left, B.right); return g; } /** From a set ('a'|'b') build * * o->o-'a'..'b'->o->o (last NFAState is blockEndNFAState pointed to by all alts) */ public StateCluster build_AlternativeBlockFromSet(StateCluster set) { if ( set==null ) { return null; } // single alt, no decision, just return only alt state cluster NFAState startOfAlt = newState(); // must have this no matter what transitionBetweenStates(startOfAlt, set.left, Label.EPSILON); return new StateCluster(startOfAlt,set.right); } /** From A|B|..|Z alternative block build * * o->o-A->o->o (last NFAState is blockEndNFAState pointed to by all alts) * | ^ * o->o-B->o--| * | | * ... | * | | * o->o-Z->o--| * * So every alternative gets begin NFAState connected by epsilon * and every alt right side points at a block end NFAState. There is a * new NFAState in the NFAState in the StateCluster for each alt plus one for the * end NFAState. * * Special case: only one alternative: don't make a block with alt * begin/end. * * Special case: if just a list of tokens/chars/sets, then collapse * to a single edge'd o-set->o graph. * * Set alt number (1..n) in the left-Transition NFAState. */ public StateCluster build_AlternativeBlock(List alternativeStateClusters) { StateCluster result = null; if ( alternativeStateClusters==null || alternativeStateClusters.size()==0 ) { return null; } // single alt case if ( alternativeStateClusters.size()==1 ) { // single alt, no decision, just return only alt state cluster StateCluster g = (StateCluster)alternativeStateClusters.get(0); NFAState startOfAlt = newState(); // must have this no matter what transitionBetweenStates(startOfAlt, g.left, Label.EPSILON); //System.out.println("### opt saved start/stop end in (...)"); return new StateCluster(startOfAlt,g.right); } // even if we can collapse for lookahead purposes, we will still // need to predict the alts of this subrule in case there are actions // etc... This is the decision that is pointed to from the AST node // (always) NFAState prevAlternative = null; // tracks prev so we can link to next alt NFAState firstAlt = null; NFAState blockEndNFAState = newState(); blockEndNFAState.setDescription("end block"); int altNum = 1; for (Iterator iter = alternativeStateClusters.iterator(); iter.hasNext();) { StateCluster g = (StateCluster) iter.next(); // add begin NFAState for this alt connected by epsilon NFAState left = newState(); left.setDescription("alt "+altNum+" of ()"); transitionBetweenStates(left, g.left, Label.EPSILON); transitionBetweenStates(g.right, blockEndNFAState, Label.EPSILON); // Are we the first alternative? if ( firstAlt==null ) { firstAlt = left; // track extreme left node of StateCluster } else { // if not first alternative, must link to this alt from previous transitionBetweenStates(prevAlternative, left, Label.EPSILON); } prevAlternative = left; altNum++; } // return StateCluster pointing representing entire block // Points to first alt NFAState on left, block end on right result = new StateCluster(firstAlt, blockEndNFAState); firstAlt.decisionStateType = NFAState.BLOCK_START; // set EOB markers for Jean firstAlt.endOfBlockStateNumber = blockEndNFAState.stateNumber; return result; } /** From (A)? build either: * * o--A->o * | ^ * o---->| * * or, if A is a block, just add an empty alt to the end of the block */ public StateCluster build_Aoptional(StateCluster A) { StateCluster g = null; int n = nfa.grammar.getNumberOfAltsForDecisionNFA(A.left); if ( n==1 ) { // no decision, just wrap in an optional path //NFAState decisionState = newState(); NFAState decisionState = A.left; // resuse left edge decisionState.setDescription("only alt of ()? block"); NFAState emptyAlt = newState(); emptyAlt.setDescription("epsilon path of ()? block"); NFAState blockEndNFAState = null; blockEndNFAState = newState(); transitionBetweenStates(A.right, blockEndNFAState, Label.EPSILON); blockEndNFAState.setDescription("end ()? block"); //transitionBetweenStates(decisionState, A.left, Label.EPSILON); transitionBetweenStates(decisionState, emptyAlt, Label.EPSILON); transitionBetweenStates(emptyAlt, blockEndNFAState, Label.EPSILON); // set EOB markers for Jean decisionState.endOfBlockStateNumber = blockEndNFAState.stateNumber; blockEndNFAState.decisionStateType = NFAState.RIGHT_EDGE_OF_BLOCK; g = new StateCluster(decisionState, blockEndNFAState); } else { // a decision block, add an empty alt NFAState lastRealAlt = nfa.grammar.getNFAStateForAltOfDecision(A.left, n); NFAState emptyAlt = newState(); emptyAlt.setDescription("epsilon path of ()? block"); transitionBetweenStates(lastRealAlt, emptyAlt, Label.EPSILON); transitionBetweenStates(emptyAlt, A.right, Label.EPSILON); // set EOB markers for Jean (I think this is redundant here) A.left.endOfBlockStateNumber = A.right.stateNumber; A.right.decisionStateType = NFAState.RIGHT_EDGE_OF_BLOCK; g = A; // return same block, but now with optional last path } g.left.decisionStateType = NFAState.OPTIONAL_BLOCK_START; return g; } /** From (A)+ build * * |---| (Transition 2 from A.right points at alt 1) * v | (follow of loop is Transition 1) * o->o-A-o->o * * Meaning that the last NFAState in A points back to A's left Transition NFAState * and we add a new begin/end NFAState. A can be single alternative or * multiple. * * During analysis we'll call the follow link (transition 1) alt n+1 for * an n-alt A block. */ public StateCluster build_Aplus(StateCluster A) { NFAState left = newState(); NFAState blockEndNFAState = newState(); blockEndNFAState.decisionStateType = NFAState.RIGHT_EDGE_OF_BLOCK; // don't reuse A.right as loopback if it's right edge of another block if ( A.right.decisionStateType == NFAState.RIGHT_EDGE_OF_BLOCK ) { // nested A* so make another tail node to be the loop back // instead of the usual A.right which is the EOB for inner loop NFAState extraRightEdge = newState(); transitionBetweenStates(A.right, extraRightEdge, Label.EPSILON); A.right = extraRightEdge; } transitionBetweenStates(A.right, blockEndNFAState, Label.EPSILON); // follow is Transition 1 // turn A's block end into a loopback (acts like alt 2) transitionBetweenStates(A.right, A.left, Label.EPSILON); // loop back Transition 2 transitionBetweenStates(left, A.left, Label.EPSILON); A.right.decisionStateType = NFAState.LOOPBACK; A.left.decisionStateType = NFAState.BLOCK_START; // set EOB markers for Jean A.left.endOfBlockStateNumber = A.right.stateNumber; StateCluster g = new StateCluster(left, blockEndNFAState); return g; } /** From (A)* build * * |---| * v | * o->o-A-o--o (Transition 2 from block end points at alt 1; follow is Transition 1) * | ^ * o---------| (optional branch is 2nd alt of optional block containing A+) * * Meaning that the last (end) NFAState in A points back to A's * left side NFAState and we add 3 new NFAStates (the * optional branch is built just like an optional subrule). * See the Aplus() method for more on the loop back Transition. * The new node on right edge is set to RIGHT_EDGE_OF_CLOSURE so we * can detect nested (A*)* loops and insert an extra node. Previously, * two blocks shared same EOB node. * * There are 2 or 3 decision points in a A*. If A is not a block (i.e., * it only has one alt), then there are two decisions: the optional bypass * and then loopback. If A is a block of alts, then there are three * decisions: bypass, loopback, and A's decision point. * * Note that the optional bypass must be outside the loop as (A|B)* is * not the same thing as (A|B|)+. * * This is an accurate NFA representation of the meaning of (A)*, but * for generating code, I don't need a DFA for the optional branch by * virtue of how I generate code. The exit-loopback-branch decision * is sufficient to let me make an appropriate enter, exit, loop * determination. See codegen.g */ public StateCluster build_Astar(StateCluster A) { NFAState bypassDecisionState = newState(); bypassDecisionState.setDescription("enter loop path of ()* block"); NFAState optionalAlt = newState(); optionalAlt.setDescription("epsilon path of ()* block"); NFAState blockEndNFAState = newState(); blockEndNFAState.decisionStateType = NFAState.RIGHT_EDGE_OF_BLOCK; // don't reuse A.right as loopback if it's right edge of another block if ( A.right.decisionStateType == NFAState.RIGHT_EDGE_OF_BLOCK ) { // nested A* so make another tail node to be the loop back // instead of the usual A.right which is the EOB for inner loop NFAState extraRightEdge = newState(); transitionBetweenStates(A.right, extraRightEdge, Label.EPSILON); A.right = extraRightEdge; } // convert A's end block to loopback A.right.setDescription("()* loopback"); // Transition 1 to actual block of stuff transitionBetweenStates(bypassDecisionState, A.left, Label.EPSILON); // Transition 2 optional to bypass transitionBetweenStates(bypassDecisionState, optionalAlt, Label.EPSILON); transitionBetweenStates(optionalAlt, blockEndNFAState, Label.EPSILON); // Transition 1 of end block exits transitionBetweenStates(A.right, blockEndNFAState, Label.EPSILON); // Transition 2 of end block loops transitionBetweenStates(A.right, A.left, Label.EPSILON); bypassDecisionState.decisionStateType = NFAState.BYPASS; A.left.decisionStateType = NFAState.BLOCK_START; A.right.decisionStateType = NFAState.LOOPBACK; // set EOB markers for Jean A.left.endOfBlockStateNumber = A.right.stateNumber; bypassDecisionState.endOfBlockStateNumber = blockEndNFAState.stateNumber; StateCluster g = new StateCluster(bypassDecisionState, blockEndNFAState); return g; } /** Build an NFA predictor for special rule called Tokens manually that * predicts which token will succeed. The refs to the rules are not * RuleRefTransitions as I want DFA conversion to stop at the EOT * transition on the end of each token, rather than return to Tokens rule. * If I used normal build_alternativeBlock for this, the RuleRefTransitions * would save return address when jumping away from Tokens rule. * * All I do here is build n new states for n rules with an epsilon * edge to the rule start states and then to the next state in the * list: * * o->(A) (a state links to start of A and to next in list) * | * o->(B) * | * ... * | * o->(Z) * * This is the NFA created for the artificial rule created in * Grammar.addArtificialMatchTokensRule(). * * 11/28/2005: removed so we can use normal rule construction for Tokens. public NFAState build_ArtificialMatchTokensRuleNFA() { int altNum = 1; NFAState firstAlt = null; // the start state for the "rule" NFAState prevAlternative = null; Iterator iter = nfa.grammar.getRules().iterator(); // TODO: add a single decision node/state for good description while (iter.hasNext()) { Rule r = (Rule) iter.next(); String ruleName = r.name; String modifier = nfa.grammar.getRuleModifier(ruleName); if ( ruleName.equals(Grammar.ARTIFICIAL_TOKENS_RULENAME) || (modifier!=null && modifier.equals(Grammar.FRAGMENT_RULE_MODIFIER)) ) { continue; // don't loop to yourself or do nontoken rules } NFAState ruleStartState = nfa.grammar.getRuleStartState(ruleName); NFAState left = newState(); left.setDescription("alt "+altNum+" of artificial rule "+Grammar.ARTIFICIAL_TOKENS_RULENAME); transitionBetweenStates(left, ruleStartState, Label.EPSILON); // Are we the first alternative? if ( firstAlt==null ) { firstAlt = left; // track extreme top left node as rule start } else { // if not first alternative, must link to this alt from previous transitionBetweenStates(prevAlternative, left, Label.EPSILON); } prevAlternative = left; altNum++; } firstAlt.decisionStateType = NFAState.BLOCK_START; return firstAlt; } */ /** Build an atom with all possible values in its label */ public StateCluster build_Wildcard(GrammarAST associatedAST) { NFAState left = newState(); NFAState right = newState(); left.associatedASTNode = associatedAST; right.associatedASTNode = associatedAST; Label label = new Label(nfa.grammar.getTokenTypes()); // char or tokens Transition e = new Transition(label,right); left.addTransition(e); StateCluster g = new StateCluster(left, right); return g; } /** Build a subrule matching ^(. .*) (any tree or node). Let's use * (^(. .+) | .) to be safe. */ public StateCluster build_WildcardTree(GrammarAST associatedAST) { StateCluster wildRoot = build_Wildcard(associatedAST); StateCluster down = build_Atom(Label.DOWN, associatedAST); wildRoot = build_AB(wildRoot,down); // hook in; . DOWN // make .+ StateCluster wildChildren = build_Wildcard(associatedAST); wildChildren = build_Aplus(wildChildren); wildRoot = build_AB(wildRoot,wildChildren); // hook in; . DOWN .+ StateCluster up = build_Atom(Label.UP, associatedAST); wildRoot = build_AB(wildRoot,up); // hook in; . DOWN .+ UP // make optional . alt StateCluster optionalNodeAlt = build_Wildcard(associatedAST); List alts = new ArrayList(); alts.add(wildRoot); alts.add(optionalNodeAlt); StateCluster blk = build_AlternativeBlock(alts); return blk; } /** Given a collapsed block of alts (a set of atoms), pull out * the set and return it. */ protected IntSet getCollapsedBlockAsSet(State blk) { State s0 = blk; if ( s0!=null && s0.transition(0)!=null ) { State s1 = s0.transition(0).target; if ( s1!=null && s1.transition(0)!=null ) { Label label = s1.transition(0).label; if ( label.isSet() ) { return label.getSet(); } } } return null; } private void transitionBetweenStates(NFAState a, NFAState b, int label) { Transition e = new Transition(label,b); a.addTransition(e); } }