#!/usr/bin/ruby
# encoding: utf-8

=begin LICENSE

[The "BSD licence"]
Copyright (c) 2009-2010 Kyle Yetter
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

=end

module ANTLR3
unless const_defined?( :RecognizerSharedState )

RecognizerSharedState = Struct.new(
  :following,
  :error_recovery,
  :last_error_index,
  :backtracking,
  :rule_memory,
  :syntax_errors,
  :token,
  :token_start_position,
  :token_start_line,
  :token_start_column,
  :channel,
  :type,
  :text
)

=begin rdoc ANTLR3::RecognizerSharedState

A big Struct-based class containing most of the data that makes up a
recognizer's state. These attributes are externalized from the recognizer itself
so that recognizer delegation (which occurs when you import other grammars into
your grammar) can function; multiple recognizers can share a common state.

== Structure Attributes

following::
  a stack that tracks follow sets for error recovery
error_recovery::
  a flag indicating whether or not the recognizer is in error recovery mode
last_error_index::
  the index in the input stream of the last error
backtracking::
  tracks the backtracking depth
rule_memory::
  if a grammar is compiled with the memoization option, this will be
  set to a hash that maps each rule to a table of cached stop indices
  keyed by input start position
syntax_errors::
  tracks the number of syntax errors seen so far
token::
  holds newly constructed tokens for lexer rules
token_start_position::
  the input stream index at which the token starts
token_start_line::
  the input stream line number at which the token starts
token_start_column::
  the input stream column at which the token starts
channel::
  the channel value of the target token
type::
  the type value of the target token
text::
  the text of the target token

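A state object can also be shared explicitly. As a minimal sketch
(recognizer class names hypothetical), pass a common state to each
recognizer via the +:state+ construction option:

  state  = ANTLR3::RecognizerSharedState.new
  master = MasterParser.new( tokens, :state => state )
  helper = HelperParser.new( tokens, :state => state )
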
=end

class RecognizerSharedState
  def initialize
    super( [], false, -1, 0, nil, 0, nil, -1 )
    # ^-- same as this --v
    # self.following = []
    # self.error_recovery = false
    # self.last_error_index = -1
    # self.backtracking = 0
    # self.syntax_errors = 0
    # self.token_start_position = -1
  end


  # restores all of the state variables to their respective
  # initial default values
  def reset!
    self.following.clear
    self.error_recovery = false
    self.last_error_index = -1
    self.backtracking = 0
    self.rule_memory and rule_memory.clear
    self.syntax_errors = 0
    self.token = nil
    self.token_start_position = -1
    self.token_start_line = nil
    self.token_start_column = nil
    self.channel = nil
    self.type = nil
    self.text = nil
  end
end

end # unless const_defined?( :RecognizerSharedState )

=begin rdoc ANTLR3::Scope

= Scope

Scope is used to represent instances of ANTLR's various attribute scopes.
It is built on Ruby's built-in Struct class, but it takes string
attribute declarations from the ANTLR grammar as parameters, and overrides
the #initialize method to set the default values if any are present in
the scope declaration.

  Block = Scope.new( "name", "depth = 0", "variables = {}" )
  Block.new                    # => #<struct Block name=nil, depth=0, variables={}>
  Block.new( "function" )      # => #<struct Block name="function", depth=0, variables={}>
  Block.new( 'a', 1, :x => 3 ) # => #<struct Block name="a", depth=1, variables={ :x => 3 }>

=end

class Scope < ::Struct
  def self.new( *declarations, &body )
    names = []
    defaults = {}
    for decl in declarations
      name, default = decl.to_s.split( /\s*=\s*/, 2 )
      names << ( name = name.to_sym )
      default and defaults[ name ] = default
    end
    super( *names ) do

      # If no defaults, leave the initialize method the same as
      # the struct's default initialize for speed. Otherwise,
      # overwrite the initialize to populate with default values.
      unless defaults.empty?
        parameters = names.map do | name |
          "#{ name } = " << defaults.fetch( name, 'nil' )
        end.join( ', ' )
        class_eval( <<-END )
          def initialize( #{ parameters } )
            super( #{ names.join( ', ' ) } )
          end
        END
      end

      body and class_eval( &body )
    end
  end
end

=begin rdoc ANTLR3::Recognizer

= Recognizer

As the base class of all ANTLR-generated recognizers, Recognizer provides
much of the shared functionality and structure used in the recognition process.
For all practical purposes, the class and its immediate subclasses Lexer,
Parser, and TreeParser are abstract classes. They can be instantiated, but
they're pretty useless on their own. Instead, to make useful code, you write an
ANTLR grammar and ANTLR will generate classes which inherit from one of the
recognizer base classes, providing the implementation of the grammar rules
themselves. Recognizer defines methods related to:

* token and character matching
* prediction and recognition strategy
* recovering from errors
* reporting errors
* memoization
* simple rule tracing and debugging

=end

class Recognizer
  include Constants
  include Error
  include TokenFactory
  extend ClassMacros

  @rules = {}

  # inherited class methods and hooks
  class << self
    attr_reader :grammar_file_name,
                :antlr_version,
                :antlr_version_string,
                :library_version_string,
                :grammar_home

    attr_accessor :token_scheme, :default_rule

    # generated recognizer code uses this method to stamp
    # the code with the name of the grammar file and
    # the current version of ANTLR being used to generate
    # the code
    def generated_using( grammar_file, antlr_version, library_version = nil )
      @grammar_file_name = grammar_file.freeze
      @antlr_version_string = antlr_version.freeze
      @library_version = Util.parse_version( library_version )
      if @antlr_version_string =~ /^(\d+)\.(\d+)(?:\.(\d+)(?:b(\d+))?)?(.*)$/
        @antlr_version = [ $1, $2, $3, $4 ].map! { |str| str.to_i }
        timestamp = $5.strip
        #@antlr_release_time = $5.empty? ? nil : Time.parse($5)
      else
        raise "bad version string: %p" % @antlr_version_string
      end
    end

    # this method is used to generate return-value structures for
    # rules with multiple return values. To avoid generating
    # a special class for every rule in AST parsers and such
    # (where most rules have the same default set of return values),
    # each recognizer gets a default return value structure
    # assigned to the constant +Return+. Rules which don't
    # require additional custom members will have a rule-return
    # name constant that just points to the generic return
    # value.
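    #
    # As an illustrative sketch, assuming the default members
    # <tt>:start</tt> and <tt>:stop</tt> from #return_scope_members:
    #
    #   define_return_scope            # => the shared generic Return structure
    #   define_return_scope( :name )   # => Struct.new( :name, :start, :stop )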
    def define_return_scope( *members )
      if members.empty? then generic_return_scope
      else
        members += return_scope_members
        Struct.new( *members )
      end
    end

    # used as a hook to add additional default members
    # to default return value structures.
    # For example, all AST-building parsers override
    # this method to add an extra +:tree+ field to
    # all rule return structures.
    def return_scope_members
      [ :start, :stop ]
    end

    # sets up and returns the generic rule return
    # scope for a recognizer
    def generic_return_scope
      @generic_return_scope ||= begin
        struct = Struct.new( *return_scope_members )
        const_set( :Return, struct )
      end
    end

    def imported_grammars
      @imported_grammars ||= Set.new
    end

    def master_grammars
      @master_grammars ||= []
    end

    def master
      master_grammars.last
    end

    def masters( *grammar_names )
      for grammar in grammar_names
        unless master_grammars.include?( grammar )
          master_grammars << grammar
          attr_reader( Util.snake_case( grammar ) )
        end
      end
    end
    private :masters

    def imports( *grammar_names )
      for grammar in grammar_names
        imported_grammars.add?( grammar.to_sym ) and
          attr_reader( Util.snake_case( grammar ) )
      end
      return imported_grammars
    end
    private :imports

    def rules
      self::RULE_METHODS.dup rescue []
    end

    def default_rule
      @default_rule ||= rules.first
    end

    def debug?
      return false
    end

    def profile?
      return false
    end

    def Scope( *declarations, &body )
      Scope.new( *declarations, &body )
    end

    def token_class
      @token_class ||= begin
        self::Token            rescue
        superclass.token_class rescue
        ANTLR3::CommonToken
      end
    end
    private :generated_using
  end

  @grammar_file_name = nil
  @antlr_version = ANTLR3::ANTLR_VERSION
  @antlr_version_string = ANTLR3::ANTLR_VERSION_STRING

  def grammar_file_name
    self.class.grammar_file_name
  end

  def antlr_version
    self.class.antlr_version
  end

  def antlr_version_string
    self.class.antlr_version_string
  end

  attr_accessor :input
  attr_reader :state

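  # yields each recognizer delegate created for an imported grammar
  # (skipping any that have not been assigned); without a block, an
  # enumerator over the delegates is returned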
  def each_delegate
    block_given? or return enum_for( __method__ )
    for grammar in self.class.imported_grammars
      del = __send__( Util.snake_case( grammar ) ) and
        yield( del )
    end
  end

  # Create a new recognizer. The constructor simply ensures that
  # all recognizers are initialized with a shared state object.
  # See the main recognizer subclasses for more specific
  # information about creating recognizer objects like
  # lexers and parsers.
  def initialize( options = {} )
    @state  = options[ :state ] || RecognizerSharedState.new
    @error_output = options.fetch( :error_output, $stderr )
    defined?( @input ) or @input = nil
    initialize_dfas
  end

  # Resets the recognizer's state data to initial values.
  # As a result, all error tracking and error recovery
  # data accumulated in the current state will be cleared.
  # It will also attempt to reset the input stream
  # via input.reset, but it ignores any errors received
  # from doing so. Thus the input stream is not guaranteed
  # to be rewound to its initial position.
  def reset
    @state and @state.reset!
    @input and @input.reset rescue nil
  end

  # Attempt to match the current input symbol against the token type
  # specified by +type+. If the symbol matches the type,
  # consume the current symbol and return its value. If
  # the symbol doesn't match, attempt to use the follow-set
  # data provided by +follow+ to recover from the mismatched
  # token.
  def match( type, follow )
    matched_symbol = current_symbol
    if @input.peek == type
      @input.consume
      @state.error_recovery = false
      return matched_symbol
    end
    raise( BacktrackingFailed ) if @state.backtracking > 0
    return recover_from_mismatched_token( type, follow )
  end

  # match anything -- i.e. wildcard match. Simply consume
  # the current symbol from the input stream.
  def match_any
    @state.error_recovery = false
    @input.consume
  end

  ##############################################################################################
  ###################################### Error Reporting #######################################
  ##############################################################################################

  # When a recognition error occurs, this method is the main
  # hook for carrying out the error reporting process. The
  # default implementation calls +display_recognition_error+
  # to display the error info on $stderr.
  def report_error( e = $! )
    @state.error_recovery and return
    @state.syntax_errors += 1
    @state.error_recovery = true
    display_recognition_error( e )
  end

  # error reporting hook for presenting the error information.
  # The default implementation builds appropriate error
  # message text using +error_header+ and +error_message+,
  # and calls +emit_error_message+ to write the error
  # message out to some source
  def display_recognition_error( e = $! )
    header = error_header( e )
    message = error_message( e )
    emit_error_message( "#{ header } #{ message }" )
  end

  # used to construct an appropriate error message
  # based on the specific type of error and the
  # error's attributes
  def error_message( e = $! )
    case e
    when UnwantedToken
      token_name = token_name( e.expecting )
      "extraneous input #{ token_error_display( e.unexpected_token ) } expecting #{ token_name }"
    when MissingToken
      token_name = token_name( e.expecting )
      "missing #{ token_name } at #{ token_error_display( e.symbol ) }"
    when MismatchedToken
      token_name = token_name( e.expecting )
      "mismatched input #{ token_error_display( e.symbol ) } expecting #{ token_name }"
    when MismatchedTreeNode
      token_name = token_name( e.expecting )
      "mismatched tree node: #{ e.symbol } expecting #{ token_name }"
    when NoViableAlternative
      "no viable alternative at input " << token_error_display( e.symbol )
    when MismatchedSet
      "mismatched input %s expecting set %s" %
        [ token_error_display( e.symbol ), e.expecting.inspect ]
    when MismatchedNotSet
      "mismatched input %s expecting set %s" %
        [ token_error_display( e.symbol ), e.expecting.inspect ]
    when FailedPredicate
      "rule %s failed predicate: { %s }?" % [ e.rule_name, e.predicate_text ]
    else e.message
    end
  end

  #
  # used to add a tag to the error message that indicates
  # the location of the input stream when the error
  # occurred
  #
  def error_header( e = $! )
    e.location
  end

  #
  # formats a token object appropriately for inspection
  # within an error message
  #
  def token_error_display( token )
    unless text = token.text || ( token.source_text rescue nil )
      text =
        case
        when token.type == EOF then '<EOF>'
        when name = token_name( token.type ) rescue nil then "<#{ name }>"
        when token.respond_to?( :name ) then "<#{ token.name }>"
        else "<#{ token.type }>"
        end
    end
    return text.inspect
  end

  #
  # Write the error report data out to some source. By default,
  # the error message is written to $stderr
  #
  def emit_error_message( message )
    @error_output.puts( message ) if @error_output
  end

  ##############################################################################################
  ###################################### Error Recovery ########################################
  ##############################################################################################

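  # the standard panic-mode recovery routine: computes the combined
  # follow set for the current rule chain and consumes input symbols
  # until one of them appears; if no input was consumed since the
  # last error (same input index), one symbol is consumed first so
  # the recognizer cannot loop endlessly on the same error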
  def recover( error = $! )
    @state.last_error_index == @input.index and @input.consume
    @state.last_error_index = @input.index

    follow_set = compute_error_recovery_set

    resync { consume_until( follow_set ) }
  end

  def resync
    begin_resync
    return( yield )
  ensure
    end_resync
  end

  # overridable hook method that is executed at the start of the
  # resyncing procedure in recover
  #
  # by default, it does nothing
  def begin_resync
    # do nothing
  end

  # overridable hook method that is executed after the resyncing
  # procedure has completed
  #
  # by default, it does nothing
  def end_resync
    # do nothing
  end

  # (The following explanation has been lifted directly from the
  #  source code documentation of the ANTLR Java runtime library)
  #
  # Compute the error recovery set for the current rule.  During
  # rule invocation, the parser pushes the set of tokens that can
  # follow that rule reference on the stack; this amounts to
  # computing FIRST of what follows the rule reference in the
  # enclosing rule. This local follow set only includes tokens
  # from within the rule; i.e., the FIRST computation done by
  # ANTLR stops at the end of a rule.
  #
  # EXAMPLE
  #
  # When you find a "no viable alt exception", the input is not
  # consistent with any of the alternatives for rule r.  The best
  # thing to do is to consume tokens until you see something that
  # can legally follow a call to r *or* any rule that called r.
  # You don't want the exact set of viable next tokens because the
  # input might just be missing a token--you might consume the
  # rest of the input looking for one of the missing tokens.
  #
  # Consider grammar:
  #
  #   a : '[' b ']'
  #     | '(' b ')'
  #     ;
  #   b : c '^' INT ;
  #   c : ID
  #     | INT
  #     ;
  #
  # At each rule invocation, the set of tokens that could follow
  # that rule is pushed on a stack.  Here are the various "local"
  # follow sets:
  #
  #   FOLLOW( b1_in_a ) = FIRST( ']' ) = ']'
  #   FOLLOW( b2_in_a ) = FIRST( ')' ) = ')'
  #   FOLLOW( c_in_b ) = FIRST( '^' ) = '^'
  #
  # Upon erroneous input "[]", the call chain is
  #
  #   a -> b -> c
  #
  # and, hence, the follow context stack is:
  #
  #   depth  local follow set     after call to rule
  #     0         \<EOF>                   a (from main( ) )
  #     1          ']'                     b
  #     2          '^'                     c
  #
  # Notice that <tt>')'</tt> is not included, because b would have to have
  # been called from a different context in rule a for ')' to be
  # included.
  #
  # For error recovery, we cannot consider FOLLOW(c)
  # (context-sensitive or otherwise).  We need the combined set of
  # all context-sensitive FOLLOW sets--the set of all tokens that
  # could follow any reference in the call chain.  We need to
  # resync to one of those tokens.  Note that FOLLOW(c)='^' and if
  # we resync'd to that token, we'd consume until EOF.  We need to
  # sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
  # In this case, for input "[]", LA(1) is in this set so we would
  # not consume anything and after printing an error rule c would
  # return normally.  It would not find the required '^' though.
  # At this point, it gets a mismatched token error and throws an
  # exception (since LA(1) is not in the viable following token
  # set).  The rule exception handler tries to recover, but finds
  # the same recovery set and doesn't consume anything.  Rule b
  # exits normally returning to rule a.  Now it finds the ']' (and
  # with the successful match exits errorRecovery mode).
  #
  # So, you can see that the parser walks up the call chain looking
  # for the token that was a member of the recovery set.
  #
  # Errors are not generated in errorRecovery mode.
  #
  # ANTLR's error recovery mechanism is based upon original ideas:
  #
  # "Algorithms + Data Structures = Programs" by Niklaus Wirth
  #
  # and
  #
  # "A note on error recovery in recursive descent parsers":
  # http://portal.acm.org/citation.cfm?id=947902.947905
  #
  # Later, Josef Grosch had some good ideas:
  #
  # "Efficient and Comfortable Error Recovery in Recursive Descent
  # Parsers":
  # ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
  #
  # Like Grosch I implemented local FOLLOW sets that are combined
  # at run-time upon error to avoid overhead during parsing.
  def compute_error_recovery_set
    combine_follows( false )
  end

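  # single-token mismatch recovery: if deleting the current symbol
  # would line the input back up (the symbol after it matches +type+),
  # the spurious token is reported and consumed and the matching
  # symbol is consumed and returned; if instead the current symbol
  # could legally follow +type+ (a token appears to be missing), the
  # error is reported and a placeholder conjured by +missing_symbol+
  # is returned; otherwise a MismatchedToken error is raised.
  # Override this method in a subclass to change mismatch handling,
  # for example to bail out after the first error by raising the
  # exception instead of attempting recovery.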
  def recover_from_mismatched_token( type, follow )
    if mismatch_is_unwanted_token?( type )
      err = UnwantedToken( type )
      resync { @input.consume }
      report_error( err )

      return @input.consume
    end

    if mismatch_is_missing_token?( follow )
      inserted = missing_symbol( nil, type, follow )
      report_error( MissingToken( type, inserted ) )
      return inserted
    end

    raise MismatchedToken( type )
  end

  def recover_from_mismatched_set( e, follow )
    if mismatch_is_missing_token?( follow )
      report_error( e )
      return missing_symbol( e, INVALID_TOKEN_TYPE, follow )
    end
    raise e
  end

  def recover_from_mismatched_element( e, follow )
    follow.nil? and return false
    if follow.include?( EOR_TOKEN_TYPE )
      viable_tokens = compute_context_sensitive_rule_follow
      follow = ( follow | viable_tokens ) - Set[ EOR_TOKEN_TYPE ]
    end
    if follow.include?( @input.peek )
      report_error( e )
      return true
    end
    return false
  end

  # Conjure up a missing token during error recovery.
  #
  # The recognizer attempts to recover from single missing
  # symbols. But, actions might refer to that missing symbol.
  # For example, x=ID {f($x);}. The action clearly assumes
  # that there has been an identifier matched previously and that
  # $x points at that token. If that token is missing, but
  # the next token in the stream is what we want, we assume that
  # this token is missing and we keep going. Because we
  # have to return some token to replace the missing token,
  # we have to conjure one up. This method gives the user control
  # over the tokens returned for missing tokens. Mostly,
  # you will want to create something special for identifier
  # tokens. For literals such as '{' and ',', the default
  # action in the parser or tree parser works. It simply creates
  # a CommonToken of the appropriate type. The text will be the token.
  # If you change what tokens must be created by the lexer,
  # override this method to create the appropriate tokens.
  def missing_symbol( error, expected_token_type, follow )
    return nil
  end

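  # returns true when the symbol after the current one matches +type+,
  # meaning the current symbol looks like a spurious extra token that
  # can simply be deleted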
  def mismatch_is_unwanted_token?( type )
    @input.peek( 2 ) == type
  end

  def mismatch_is_missing_token?( follow )
    follow.nil? and return false
    if follow.include?( EOR_TOKEN_TYPE )
      viable_tokens = compute_context_sensitive_rule_follow
      follow = follow | viable_tokens

      follow.delete( EOR_TOKEN_TYPE ) unless @state.following.empty?
    end
    if follow.include?( @input.peek ) or follow.include?( EOR_TOKEN_TYPE )
      return true
    end
    return false
  end

  def syntax_errors?
    ( error_count = @state.syntax_errors ) > 0 and return( error_count )
  end

  # returns the total number of syntax errors reported
  # by the recognizer so far
  def number_of_syntax_errors
    @state.syntax_errors
  end

  #
  # Compute the context-sensitive +FOLLOW+ set for the current rule.
  # This is the set of token types that can follow a specific rule
  # reference given a specific call chain.  You get the set of
  # viable tokens that can possibly come next (look depth 1)
  # given the current call chain.  Contrast this with the
  # definition of plain FOLLOW for rule r:
  #
  #    FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
  #
  # where x in T* and alpha, beta in V*; T is set of terminals and
  # V is the set of terminals and nonterminals.  In other words,
  # FOLLOW(r) is the set of all tokens that can possibly follow
  # references to r in *any* sentential form (context).  At
  # runtime, however, we know precisely which context applies as
  # we have the call chain.  We may compute the exact (rather
  # than covering superset) set of following tokens.
  #
  # For example, consider grammar:
  #
  #   stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
  #        | "return" expr '.'
  #        ;
  #   expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
  #   atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
  #        | '(' expr ')'
  #        ;
  #
  # The FOLLOW sets are all inclusive whereas context-sensitive
  # FOLLOW sets are precisely what could follow a rule reference.
  # For the input "i=(3);", here is the derivation:
  #
  #   stat => ID '=' expr ';'
  #        => ID '=' atom ('+' atom)* ';'
  #        => ID '=' '(' expr ')' ('+' atom)* ';'
  #        => ID '=' '(' atom ')' ('+' atom)* ';'
  #        => ID '=' '(' INT ')' ('+' atom)* ';'
  #        => ID '=' '(' INT ')' ';'
  #
  # At the "3" token, you'd have a call chain of
  #
  #   stat -> expr -> atom -> expr -> atom
  #
  # What can follow that specific nested ref to atom?  Exactly ')'
  # as you can see by looking at the derivation of this specific
  # input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
  #
  # You want the exact viable token set when recovering from a
  # token mismatch.  Upon token mismatch, if LA(1) is member of
  # the viable next token set, then you know there is most likely
  # a missing token in the input stream.  "Insert" one by just not
  # throwing an exception.
  #
  def compute_context_sensitive_rule_follow
    combine_follows true
  end

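  # merges the local follow sets sitting on the state's +following+
  # stack, working down from the most recent rule invocation. When
  # +exact+ is true, merging stops at the first follow set that does
  # not contain EOR_TOKEN_TYPE (a rule that must match more input
  # before it can exit), yielding the exact context-sensitive follow
  # set rather than a covering superset.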
  def combine_follows( exact )
    follow_set = Set.new
    @state.following.each_with_index.reverse_each do |local_follow_set, index|
      follow_set |= local_follow_set
      if exact
        if local_follow_set.include?( EOR_TOKEN_TYPE )
          follow_set.delete( EOR_TOKEN_TYPE ) if index > 0
        else
          break
        end
      end
    end
    return follow_set
  end

  #
  # Match needs to return the current input symbol, which gets put
  # into the label for the associated token ref; e.g., x=ID.  Token
  # and tree parsers need to return different objects. Rather than test
  # for input stream type or change the IntStream interface, I use
  # a simple method to ask the recognizer to tell me what the current
  # input symbol is.
  #
  # This is ignored for lexers.
  #
  def current_symbol
    @input.look
  end

  #
  # Consume input symbols until one matches a type within types
  #
  # types can be a single symbol type or a set of symbol types
  #
  def consume_until( types )
    types.is_a?( Set ) or types = Set[ *types ]
    type = @input.peek
    until type == EOF or types.include?( type )
      @input.consume
      type = @input.peek
    end
    return( type )
  end

  #
  # Returns true if the recognizer is currently in a decision for which
  # backtracking has been enabled
  #
  def backtracking?
    @state.backtracking > 0
  end

  def backtracking_level
    @state.backtracking
  end

  def backtracking_level=( n )
    @state.backtracking = n
  end

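  # marks the current input position and speculatively evaluates the
  # given block, rewinding the input to the marked position afterward;
  # returns true if the block completed without raising
  # BacktrackingFailed and false otherwise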
  def backtrack
    @state.backtracking += 1
    start = @input.mark
    success =
      begin yield
      rescue BacktrackingFailed then false
      else true
      end
    return success
  ensure
    @input.rewind( start )
    @state.backtracking -= 1
  end

  def syntactic_predicate?( name )
    backtrack { send name }
  end

  alias backtracking backtracking_level
  alias backtracking= backtracking_level=

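  # fetches the memoization table for +rule+ (creating an empty one
  # if necessary) and returns the entry recorded at +start_index+:
  # either a cached stop index, MEMO_RULE_FAILED, or MEMO_RULE_UNKNOWN
  # when the rule has not yet been attempted at that position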
  def rule_memoization( rule, start_index )
    @state.rule_memory.fetch( rule ) do
      @state.rule_memory[ rule ] = Hash.new( MEMO_RULE_UNKNOWN )
    end[ start_index ]
  end

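  # consults the memoization record for +rule+ at the current input
  # position: a memoized success seeks the input just past the cached
  # stop index and returns true; a memoized failure raises
  # BacktrackingFailed; an unknown entry returns false so that the
  # rule is parsed normally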
  def already_parsed_rule?( rule )
    stop_index = rule_memoization( rule, @input.index )
    case stop_index
    when MEMO_RULE_UNKNOWN then return false
    when MEMO_RULE_FAILED
      raise BacktrackingFailed
    else
      @input.seek( stop_index + 1 )
    end
    return true
  end

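  # records the outcome of attempting +rule+ at +start_index+ in the
  # rule memory: the index of the last symbol matched on success, or
  # MEMO_RULE_FAILED on failure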
  def memoize( rule, start_index, success )
    stop_index = success ? @input.index - 1 : MEMO_RULE_FAILED
    memo = @state.rule_memory[ rule ] and memo[ start_index ] = stop_index
  end

  def trace_in( rule_name, rule_index, input_symbol )
    @error_output.printf( "--> enter %s on %s", rule_name, input_symbol )
    @state.backtracking > 0 and @error_output.printf(
      " (in backtracking mode: depth = %s)", @state.backtracking
    )
    @error_output.print( "\n" )
  end

  def trace_out( rule_name, rule_index, input_symbol )
    @error_output.printf( "<-- exit %s on %s", rule_name, input_symbol )
    @state.backtracking > 0 and @error_output.printf(
      " (in backtracking mode: depth = %s)", @state.backtracking
    )
    @error_output.print( "\n" )
  end

private

  def initialize_dfas
    # do nothing
  end
end


# constant alias for compatibility with older versions of the
# runtime library
BaseRecognizer = Recognizer

=begin rdoc ANTLR3::Lexer

= Lexer

Lexer is the default superclass of all lexers generated by ANTLR. The class
tailors the core functionality provided by Recognizer to the task of
matching patterns in the text input and breaking the input into tokens.

== About Lexers

A lexer's job is to take input text and break it up into _tokens_ -- objects
that encapsulate a piece of text, a type label (such as ID or INTEGER), and the
position of the text with respect to the input. Thus, a lexer is essentially a
complicated iterator that steps through an input stream and produces a sequence
of tokens. Sometimes lexers are enough to carry out a goal on their own, such as
tasks like source code highlighting and simple code analysis. Usually, however,
the lexer converts text into tokens for use by a parser, which recognizes larger
structures within the text.

ANTLR parsers have a variety of entry points specified by parser rules, each of
which defines the structure of a specific type of sentence in a grammar. Lexers,
however, are primarily intended to have a single entry point: the lexer looks at
the characters starting at the current input position, decides if the chunk of
text matches one of a number of possible token type definitions, wraps the chunk
into a token with information on its type and location, and advances the input
stream to the next place.

== ANTLR Lexers and the Lexer API

ANTLR-generated lexers will subclass this class, unless specified otherwise
within a grammar file. The generated class will provide an implementation of
each lexer rule as a method of the same name. The subclass will also provide an
implementation for the abstract method #m_tokens, the purpose of which is to
multiplex the token type definitions and predict what rule definition to execute
to fetch a token. The primary method in the lexer API, #next_token, uses
#m_tokens to fetch the next token and drive the iteration.

If the lexer is preparing tokens for use by an ANTLR-generated parser, the lexer
will generally be used to build a TokenStream object. The following code example
demonstrates the typical setup for using ANTLR parsers and lexers in Ruby.

  # in HypotheticalLexer.rb
  module Hypothetical
  class Lexer < ANTLR3::Lexer
    # ...
    # ANTLR generated code
    # ...
  end
  end

  # in HypotheticalParser.rb
  module Hypothetical
  class Parser < ANTLR3::Parser
    # ...
    # more ANTLR generated code
    # ...
  end
  end

  # to take hypothetical source code and prepare it for parsing,
  # there is generally a four-step construction process

  source = "some hypothetical source code"
  input = ANTLR3::StringStream.new(source, :file => 'blah-de-blah.hyp')
  lexer = Hypothetical::Lexer.new( input )
  tokens = ANTLR3::CommonTokenStream.new( lexer )
  parser = Hypothetical::Parser.new( tokens )

  # if you're using the standard streams, ANTLR3::StringStream and
  # ANTLR3::CommonTokenStream, you can write the same process
  # shown above more succinctly:

  lexer  = Hypothetical::Lexer.new("some hypothetical source code", :file => 'blah-de-blah.hyp')
  parser = Hypothetical::Parser.new( lexer )

=end
class Lexer < Recognizer
  include TokenSource
  @token_class = CommonToken

  def self.default_rule
    @default_rule ||= :token!
  end

  def self.main( argv = ARGV, options = {} )
    if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
    main = ANTLR3::Main::LexerMain.new( self, options )
    block_given? ? yield( main ) : main.execute( argv )
  end

  def self.associated_parser
    @associated_parser ||= begin
      @grammar_home and @grammar_home::Parser
    rescue NameError
      grammar_name = @grammar_home.name.split( "::" ).last
      begin
        require "#{ grammar_name }Parser"
        @grammar_home::Parser
      rescue LoadError, NameError
      end
    end
  end

  def initialize( input, options = {} )
    super( options )
    @input = cast_input( input, options )
  end

  def current_symbol
    nil
  end

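  # the main entry point of the lexer API: clears the per-token state
  # fields and invokes the #token! rule multiplexer repeatedly until a
  # usable token is produced, skipping SKIP_TOKEN results and
  # reporting and recovering from recognition errors along the way;
  # returns EOF_TOKEN once the input is exhausted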
  def next_token
    loop do
      @state.token = nil
      @state.channel = DEFAULT_CHANNEL
      @state.token_start_position = @input.index
      @state.token_start_column = @input.column
      @state.token_start_line = @input.line
      @state.text = nil
      @input.peek == EOF and return EOF_TOKEN
      begin
        token!

        case token = @state.token
        when nil then return( emit )
        when SKIP_TOKEN then next
        else
          return token
        end
      rescue NoViableAlternative => re
        report_error( re )
        recover( re )
      rescue Error::RecognitionError => re
        report_error( re )
      end
    end
  end

  def skip
    @state.token = SKIP_TOKEN
  end

  abstract :token!

  def exhaust
    self.to_a
  end

  def char_stream=( input )
    @input = nil
    reset()
    @input = input
  end

  def source_name
    @input.source_name
  end

  def emit( token = @state.token )
    token ||= create_token
    @state.token = token
    return token
  end

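  # matches +expected+ against the input: a String is matched byte by
  # byte, while any other value is treated as a single character
  # value; on mismatch outside of backtracking, the error is reported
  # and a MismatchedToken error is raised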
  def match( expected )
    case expected
    when String
      expected.each_byte do |char|
        unless @input.peek == char
          @state.backtracking > 0 and raise BacktrackingFailed
          error = MismatchedToken( char )
          recover( error )
          raise error
        end
        @input.consume()
      end
    else # single integer character
      unless @input.peek == expected
        @state.backtracking > 0 and raise BacktrackingFailed
        error = MismatchedToken( expected )
        recover( error )
        raise error
      end
      @input.consume
    end
    return true
  end

  def match_any
    @input.consume
  end

  def match_range( min, max )
    char = @input.peek
    if char.between?( min, max ) then @input.consume
    else
      @state.backtracking > 0 and raise BacktrackingFailed
      error = MismatchedRange( min.chr, max.chr )
      recover( error )
      raise( error )
    end
    return true
  end

  def line
    @input.line
  end

  def column
    @input.column
  end

  def character_index
    @input.index
  end

  def text
    @state.text and return @state.text
    @input.substring( @state.token_start_position, character_index - 1 )
  end

  def text=( text )
    @state.text = text
  end

  def report_error( e )
    display_recognition_error( e )
  end


  def error_message( e )
    char = character_error_display( e.symbol ) rescue nil
    case e
    when Error::MismatchedToken
      expecting = character_error_display( e.expecting )
      "mismatched character #{ char }; expecting #{ expecting }"
    when Error::NoViableAlternative
      "no viable alternative at character #{ char }"
    when Error::EarlyExit
      "required ( ... )+ loop did not match anything at character #{ char }"
    when Error::MismatchedNotSet
      "mismatched character %s; expecting set %p" % [ char, e.expecting ]
    when Error::MismatchedSet
      "mismatched character %s; expecting set %p" % [ char, e.expecting ]
    when Error::MismatchedRange
      a = character_error_display( e.min )
      b = character_error_display( e.max )
      "mismatched character %s; expecting set %s..%s" % [ char, a, b ]
    else super
    end
  end

  def character_error_display( char )
    case char
    when EOF then '<EOF>'
    when Integer then char.chr.inspect
    else char.inspect
    end
  end

  def recover( re )
    @input.consume
  end

  alias input= char_stream=

private

  def cast_input( input, options )
    case input
    when CharacterStream then input
    when ::String then StringStream.new( input, options )
    when ::IO, ARGF then FileStream.new( input, options )
    else input
    end
  end

  def trace_in( rule_name, rule_index )
    if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
    else symbol = '<EOF>' end
    input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
    super( rule_name, rule_index, input_symbol )
  end

  def trace_out( rule_name, rule_index )
    if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
    else symbol = '<EOF>' end
    input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
    super( rule_name, rule_index, input_symbol )
  end

  def create_token( &b )
    if block_given? then super( &b )
    else
      super do |t|
        t.input = @input
        t.type = @state.type
        t.channel = @state.channel
        t.start = @state.token_start_position
        t.stop = @input.index - 1
        t.line = @state.token_start_line
        t.text = self.text
        t.column = @state.token_start_column
      end
    end
  end
end


=begin rdoc ANTLR3::Parser

= Parser

Parser is the default base class of ANTLR-generated parser classes. The class
tailors the functionality provided by Recognizer to the task of parsing.

== About Parsing

This is just a loose overview of parsing. For considerably more in-depth coverage
of the topic, read the ANTLR documentation or check out the ANTLR website
(http://www.antlr.org).

A grammar defines the vocabulary and the sentence structure of a language. While
a lexer concerns the basic vocabulary symbols of the language, a parser's
primary task is to implement the sentence structure.

Parsers are set up by providing a stream of tokens, which is usually created by
a corresponding lexer. Then, the user requests a specific sentence-structure
within the grammar, such as "class_definition" or "xml_node", from the parser.
The parser iterates through the tokens, verifying the syntax of the sentence and
performing actions specified by the grammar. It stops when it encounters an
error or when it has matched the full sentence according to its defined
structure.

== ANTLR Parsers and the Parser API

Plain ANTLR-generated parsers directly subclass this class, unless specified
otherwise within the grammar options. The generated code will provide a method
for each parser rule defined in the ANTLR grammar, as well as any other
customized member attributes and methods specified in the source grammar.

This class does not override much of the functionality in Recognizer, and
thus the API closely mirrors Recognizer.

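As a brief sketch, reusing the hypothetical grammar classes shown in the
Lexer documentation, a parser rule is invoked simply by calling the
generated method of the same name:

  tokens = ANTLR3::CommonTokenStream.new( Hypothetical::Lexer.new( source ) )
  parser = Hypothetical::Parser.new( tokens )
  result = parser.class_definition  # invoke the parser rule "class_definition"
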
=end
class Parser < Recognizer
  def self.main( argv = ARGV, options = {} )
    if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
    main = ANTLR3::Main::ParserMain.new( self, options )
    block_given? ? yield( main ) : main.execute( argv )
  end

  def self.associated_lexer
    @associated_lexer ||= begin
      @grammar_home and @grammar_home::Lexer
    rescue NameError
      grammar_name = @grammar_home.name.split( "::" ).last
      begin
        require "#{ grammar_name }Lexer"
        @grammar_home::Lexer
      rescue LoadError, NameError
      end
    end
  end


  def initialize( input, options = {} )
    super( options )
    @input = nil
    reset
    @input = cast_input( input, options )
  end

  def missing_symbol( error, expected_type, follow )
    current = @input.look
    current = @input.look( -1 ) if current == ANTLR3::EOF_TOKEN
    t =
      case
      when current && current != ANTLR3::EOF_TOKEN then current.clone
      when @input.token_class then @input.token_class.new
      else ( create_token rescue CommonToken.new )
      end

    t.type = expected_type
    name = t.name.gsub( /(^<)|(>$)/, '' )
    t.text = "<missing #{ name }>"
    t.channel = DEFAULT_CHANNEL
    return( t )
  end

  def token_stream=( input )
    @input = nil
    reset
    @input = input
  end
  alias token_stream input

  def source_name
    @input.source_name
  end


private

  def trace_in( rule_name, rule_index )
    super( rule_name, rule_index, @input.look.inspect )
  end

  def trace_out( rule_name, rule_index )
    super( rule_name, rule_index, @input.look.inspect )
  end

  def cast_input( input, options )
    case input
    when TokenStream then input
    when TokenSource then CommonTokenStream.new( input, options )
    when IO, String, CharacterStream
      if lexer_class = self.class.associated_lexer
        CommonTokenStream.new( lexer_class.new( input, options ), options )
      else
        raise ArgumentError, Util.tidy( <<-END, true )
        | unable to automatically convert input #{ input.inspect }
        | to an ANTLR3::TokenStream object as #{ self.class }
        | does not appear to have an associated lexer class
        END
      end
    else
      # assume it's a stream if it at least implements peek and consume
      unless input.respond_to?( :peek ) and input.respond_to?( :consume )
        raise ArgumentError, Util.tidy( <<-END, true )
        | #{ self.class } requires a token stream as input, but
        | #{ input.inspect } was provided
        END
      end
      input
    end
  end

end

end