1#!/usr/bin/ruby
2# encoding: utf-8
3
4=begin LICENSE
5
6[The "BSD licence"]
7Copyright (c) 2009-2010 Kyle Yetter
8All rights reserved.
9
10Redistribution and use in source and binary forms, with or without
11modification, are permitted provided that the following conditions
12are met:
13
14 1. Redistributions of source code must retain the above copyright
15    notice, this list of conditions and the following disclaimer.
16 2. Redistributions in binary form must reproduce the above copyright
17    notice, this list of conditions and the following disclaimer in the
18    documentation and/or other materials provided with the distribution.
19 3. The name of the author may not be used to endorse or promote products
20    derived from this software without specific prior written permission.
21
22THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33=end
34
35module ANTLR3
36
37=begin rdoc ANTLR3::Token
38
39At a minimum, tokens are data structures that bind together a chunk of text and
40a corresponding type symbol, which categorizes/characterizes the content of the
41text. Tokens also usually carry information about their location in the input,
42such as absolute character index, line number, and position within the line (or
43column).
44
45Furthermore, ANTLR tokens are assigned a "channel" number, an extra degree of
46categorization that groups things on a larger scale. Parsers will usually ignore
47tokens that have channel value 99 (the HIDDEN_CHANNEL), so you can keep things
48like comment and white space huddled together with neighboring tokens,
49effectively ignoring them without discarding them.
50
51ANTLR tokens also keep a reference to the source stream from which they
52originated. Token streams will also provide an index value for the token, which
53indicates the position of the token relative to other tokens in the stream,
54starting at zero. For example, the 22nd token pulled from a lexer by
55CommonTokenStream will have index value 21.
56
57== Token as an Interface
58
59This library provides a token implementation (see CommonToken). Additionally,
60you may write your own token class as long as you provide methods that give
61access to the attributes expected by a token. Even though most of the ANTLR
62library tries to use duck-typing techniques instead of pure object-oriented type
checking, it's a good idea to include this ANTLR3::Token module in your customized
64token class.
65
66=end
67
module Token
  include ANTLR3::Constants
  include Comparable
  
  # the token's associated chunk of text
  attr_accessor :text
  
  # the integer value associated with the token's type
  attr_accessor :type
  
  # the text's starting line number within the source (indexed starting at 1)
  attr_accessor :line
  
  # the text's starting position in the line within the source (indexed starting at 0)
  attr_accessor :column
  
  # the integer value of the channel to which the token is assigned
  attr_accessor :channel
  
  # the index of the token with respect to the other tokens produced during lexing
  attr_accessor :index
  
  # a reference to the input stream from which the token was extracted
  attr_accessor :input
  
  # the absolute character index in the input at which the text starts
  attr_accessor :start
  
  # the absolute character index in the input at which the text ends
  attr_accessor :stop

  alias :input_stream :input
  alias :input_stream= :input=
  alias :token_index :index
  alias :token_index= :index=
  
  #
  # The match operator has been implemented to match against several different
  # attributes of a token for convenience in quick scripts
  #
  # @example Match against an integer token type constant
  #   token =~ VARIABLE_NAME   => true/false
  # @example Match against a token type name as a Symbol
  #   token =~ :FLOAT          => true/false
  # @example Match the token text against a Regular Expression
  #   token =~ /^@[a-z_]\w*$/i
  # @example Compare the token's text to a string
  #   token =~ "class"
  # 
  def =~ obj
    case obj
    when Integer then type == obj
    when Symbol then name == obj.to_s
    when Regexp then obj =~ text
    when String then text == obj
    else super
    end
  end
  
  #
  # Tokens are comparable by their stream index values
  # 
  def <=> tk2
    index <=> tk2.index
  end
  
  # hook invoked by #dup / #clone: the copy carries over every attribute
  # (cloning +text+ so the copy owns its string), but its stream +index+
  # is reset to -1 since the copy does not belong to a token stream
  def initialize_copy( orig )
    self.index   = -1
    self.type    = orig.type
    self.channel = orig.channel
    self.text    = orig.text.clone if orig.text
    self.start   = orig.start
    self.stop    = orig.stop
    self.line    = orig.line
    self.column  = orig.column
    self.input   = orig.input
  end
  
  # true when the token is tied to actual input: it has an input stream
  # as well as start and stop character indices
  def concrete?
    input && start && stop ? true : false
  end
  
  # the opposite of #concrete? -- true for fabricated tokens that lack an
  # input stream or start/stop character positions
  def imaginary?
    input && start && stop ? false : true
  end
  
  # the name of the token's type, resolved via the private #token_name
  def name
    token_name( type )
  end
  
  # the name of the token's input source, or nil when there is no input
  # (delegates to the input stream's +source_name+)
  def source_name
    i = input and i.source_name
  end
  
  # true when the token is assigned to the hidden channel
  def hidden?
    channel == HIDDEN_CHANNEL
  end
  
  # for concrete tokens, the raw text sliced out of the input stream;
  # otherwise, falls back to the +text+ attribute
  def source_text
    concrete? ? input.substring( start, stop ) : text
  end
  
  #
  # Sets the token's channel value to HIDDEN_CHANNEL
  # 
  def hide!
    self.channel = HIDDEN_CHANNEL
  end
  
  # a compact single-line rendering showing index, type name, text,
  # line/column position, and channel (channel only when non-default)
  # NOTE(review): assumes +line+ and +index+ are numeric; a nil value for
  # either would raise here -- confirm callers always populate them
  def inspect
    text_inspect    = text  ? "[#{ text.inspect }] " : ' '
    text_position   = line > 0  ? "@ line #{ line } col #{ column } " : ''
    stream_position = start ? "(#{ range.inspect })" : ''
    
    front =  index >= 0 ? "#{ index } " : ''
    rep = front << name << text_inspect <<
                text_position << stream_position
    rep.strip!
    channel == DEFAULT_CHANNEL or rep << " (#{ channel.to_s })"
    return( rep )
  end
  
  # pretty-print support (pp library): renders via #inspect
  def pretty_print( printer )
    printer.text( inspect )
  end
  
  # the token's character extent within the input as a Range
  # (nil when start or stop cannot form a range)
  def range
    start..stop rescue nil
  end
  
  # tokens coerce to their integer stream index
  def to_i
    index.to_i
  end
  
  # tokens coerce to their text (empty string when text is nil)
  def to_s
    text.to_s
  end
  
private
  
  # map a token type value to its name via the built-in name table;
  # TokenScheme modules supply their own token_name to extend this
  def token_name( type )
    BUILT_IN_TOKEN_NAMES[ type ]
  end
end
212
# the standard token implementation is built on a Struct binding the nine
# token attributes; note the member order here is part of the public
# interface (positional arguments to CommonToken.new follow it)
CommonToken = Struct.new( :type, :channel, :text, :input, :start,
                         :stop, :index, :line, :column )
215
216=begin rdoc ANTLR3::CommonToken
217
218The base class for the standard implementation of Token. It is implemented as a
219simple Struct as tokens are basically simple data structures binding together a
220bunch of different information and Structs are slightly faster than a standard
221Object with accessor methods implementation.
222
223By default, ANTLR generated ruby code will provide a customized subclass of
224CommonToken to track token-type names efficiently for debugging, inspection, and
225general utility. Thus code generated for a standard combo lexer-parser grammar
226named XYZ will have a base module named XYZ and a customized CommonToken
227subclass named XYZ::Token.
228
229Here is the token structure attribute list in order:
230
231* <tt>type</tt>
232* <tt>channel</tt>
233* <tt>text</tt>
234* <tt>input</tt>
235* <tt>start</tt>
236* <tt>stop</tt>
237* <tt>index</tt>
238* <tt>line</tt>
239* <tt>column</tt>
240
241=end
242
class CommonToken
  include Token
  
  # attribute defaults applied by ::create for fields left unspecified
  DEFAULT_VALUES = { 
    :channel => DEFAULT_CHANNEL,
    :index   => -1,
    :line    =>  0,
    :column  => -1
  }.freeze
  
  # map a token type value to its built-in name string
  def self.token_name( type )
    BUILT_IN_TOKEN_NAMES[ type ]
  end
  
  # construct a token from a hash of attribute-name => value pairs,
  # falling back to DEFAULT_VALUES for any omitted attribute
  def self.create( fields = {} )
    merged = DEFAULT_VALUES.merge( fields )
    new( *members.map { |member| merged[ member.to_sym ] } )
  end
  
  # allows you to make a copy of a token with a different class
  def self.from_token( token )
    text_copy = token.text ? token.text.clone : nil
    new( token.type, token.channel, text_copy, token.input,
         token.start, token.stop, -1, token.line, token.column )
  end
  
  def initialize( type = nil, channel = DEFAULT_CHANNEL, text = nil,
                 input = nil, start = nil, stop = nil, index = -1,
                 line = 0, column = -1 )
    super
    yield( self ) if block_given?
    # when no text was supplied but the token spans a region of input,
    # derive the text from the input stream (attributes are re-read here
    # because the block above may have modified them)
    if self.text.nil? && self.start && self.stop
      self.text = self.input.substring( self.start, self.stop )
    end
  end
  
  alias :input_stream :input
  alias :input_stream= :input=
  alias :token_index :index
  alias :token_index= :index=
end
284
module Constants
  
  # shared, frozen token signalling End of File / End of Input
  EOF_TOKEN = CommonToken.new( EOF ).freeze
  # shared, frozen token representing invalid input
  INVALID_TOKEN = CommonToken.new( INVALID_TOKEN_TYPE ).freeze
  # shared, frozen marker token indicating a token to be skipped;
  # NOTE(review): same type as INVALID_TOKEN -- distinguished by object identity
  SKIP_TOKEN = CommonToken.new( INVALID_TOKEN_TYPE ).freeze  
end
292
293
294
295=begin rdoc ANTLR3::TokenSource
296
297TokenSource is a simple mixin module that demands an
298implementation of the method #next_token. In return, it
299defines methods #next and #each, which provide basic
300iterator methods for token generators. Furthermore, it
301includes Enumerable to provide the standard Ruby iteration
302methods to token generators, like lexers.
303
304=end
305
module TokenSource
  include Constants
  include Enumerable
  extend ClassMacros
  
  abstract :next_token
  
  # fetch a single token, raising StopIteration once the source is
  # exhausted (a nil token or one with type EOF)
  def next
    tk = next_token()
    tk.nil? || tk.type == EOF and raise StopIteration
    tk
  end
  
  # yield each token produced by the source until EOF is reached;
  # without a block, an Enumerator is returned instead
  def each
    block_given? or return enum_for( :each )
    loop do
      tk = next_token
      tk && tk.type != EOF or break
      yield( tk )
    end
    return self
  end

  # wrap this source in a CommonTokenStream, forwarding any block given
  def to_stream( options = {} )
    if block_given?
      CommonTokenStream.new( self, options ) { | t, stream | yield( t, stream ) }
    else
      CommonTokenStream.new( self, options )
    end
  end
end
335
336
337=begin rdoc ANTLR3::TokenFactory
338
339There are a variety of different entities throughout the ANTLR runtime library
that need to create token objects. This module serves as a mixin that provides
341methods for constructing tokens.
342
Including this module provides a +token_class+ attribute. Instances of the
344including class can create tokens using the token class (which defaults to
345ANTLR3::CommonToken). Token classes are presumed to have an #initialize method
346that can be called without any parameters and the token objects are expected to
347have the standard token attributes (see ANTLR3::Token).
348
349=end
350
module TokenFactory
  attr_writer :token_class
  
  # the class used to manufacture tokens, resolved lazily in this order:
  # an explicitly assigned value, the enclosing class's token_class,
  # a Token constant scoped under self, and finally ANTLR3::CommonToken
  def token_class
    @token_class ||=
      begin
        self.class.token_class
      rescue StandardError
        begin
          self::Token
        rescue StandardError
          ANTLR3::CommonToken
        end
      end
  end
  
  # build a new token via token_class, forwarding any block supplied
  def create_token( *args, &block )
    token_class.new( *args, &block )
  end
end
371
372
373=begin rdoc ANTLR3::TokenScheme
374
375TokenSchemes exist to handle the problem of defining token types as integer
376values while maintaining meaningful text names for the types. They are
377dynamically defined modules that map integer values to constants with token-type
378names.
379
380---
381
382Fundamentally, tokens exist to take a chunk of text and identify it as belonging
383to some category, like "VARIABLE" or "INTEGER". In code, the category is
384represented by an integer -- some arbitrary value that ANTLR will decide to use
385as it is creating the recognizer. The purpose of using an integer (instead of
386say, a ruby symbol) is that ANTLR's decision logic often needs to test whether a
387token's type falls within a range, which is not possible with symbols.
388
389The downside of token types being represented as integers is that a developer
390needs to be able to reference the unknown type value by name in action code.
391Furthermore, code that references the type by name and tokens that can be
392inspected with names in place of type values are more meaningful to a developer.
393
394Since ANTLR requires token type names to follow capital-letter naming
395conventions, defining types as named constants of the recognizer class resolves
396the problem of referencing type values by name. Thus, a token type like
397``VARIABLE'' can be represented by a number like 5 and referenced within code by
398+VARIABLE+. However, when a recognizer creates tokens, the name of the token's
399type cannot be seen without using the data defined in the recognizer.
400
401Of course, tokens could be defined with a name attribute that could be specified
402when tokens are created. However, doing so would make tokens take up more space
403than necessary, as well as making it difficult to change the type of a token
404while maintaining a correct name value.
405
406TokenSchemes exist as a technique to manage token type referencing and name
407extraction. They:
408
4091. keep token type references clear and understandable in recognizer code
4102. permit access to a token's type-name independently of recognizer objects
4113. allow multiple classes to share the same token information
412
413== Building Token Schemes
414
415TokenScheme is a subclass of Module. Thus, it has the method
416<tt>TokenScheme.new(tk_class = nil) { ... module-level code ...}</tt>, which
417will evaluate the block in the context of the scheme (module), similarly to
418Module#module_eval. Before evaluating the block, <tt>.new</tt> will setup the
419module with the following actions:
420
4211. define a customized token class (more on that below)
4222. add a new constant, TOKEN_NAMES, which is a hash that maps types to names
4233. dynamically populate the new scheme module with a couple instance methods
4244. include ANTLR3::Constants in the new scheme module
425
As the TokenScheme class functions as a metaclass, figuring out some of the
scoping behavior can be mildly confusing if you're trying to get a handle on the
428entity for your own purposes. Remember that all of the instance methods of
429TokenScheme function as module-level methods of TokenScheme instances, ala
430+attr_accessor+ and friends.
431
432<tt>TokenScheme#define_token(name_symbol, int_value)</tt> adds a constant
433definition <tt>name_symbol</tt> with the value <tt>int_value</tt>. It is
434essentially like <tt>Module#const_set</tt>, except it forbids constant
435overwriting (which would mess up recognizer code fairly badly) and adds an
436inverse type-to-name map to its own <tt>TOKEN_NAMES</tt> table.
437<tt>TokenScheme#define_tokens</tt> is a convenience method for defining many
438types with a hash pairing names to values.
439
440<tt>TokenScheme#register_name(value, name_string)</tt> specifies a custom
441type-to-name definition. This is particularly useful for the anonymous tokens
442that ANTLR generates for literal strings in the grammar specification. For
443example, if you refer to the literal <tt>'='</tt> in some parser rule in your
444grammar, ANTLR will add a lexer rule for the literal and give the token a name
445like <tt>T__<i>x</i></tt>, where <tt><i>x</i></tt> is the type's integer value.
446Since this is pretty meaningless to a developer, generated code should add a
447special name definition for type value <tt><i>x</i></tt> with the string
448<tt>"'='"</tt>.
449
450=== Sample TokenScheme Construction
451
452  TokenData = ANTLR3::TokenScheme.new do
453    define_tokens(
454      :INT  => 4,
455      :ID   => 6,
456      :T__5 => 5,
457      :WS   => 7
458    )
459    
460    # note the self:: scoping below is due to the fact that
461    # ruby lexically-scopes constant names instead of
462    # looking up in the current scope
463    register_name(self::T__5, "'='")
464  end
465  
466  TokenData::ID           # => 6
467  TokenData::T__5         # => 5
468  TokenData.token_name(4) # => 'INT'
469  TokenData.token_name(5) # => "'='"
470  
471  class ARecognizerOrSuch < ANTLR3::Parser
472    include TokenData
473    ID   # => 6
474  end
475
476== Custom Token Classes and Relationship with Tokens
477
478When a TokenScheme is created, it will define a subclass of ANTLR3::CommonToken
and assign it to the constant name +Token+. This token class will both include
480and extend the scheme module. Since token schemes define the private instance
481method <tt>token_name(type)</tt>, instances of the token class are now able to
482provide their type names. The Token method <tt>name</tt> uses the
483<tt>token_name</tt> method to provide the type name as if it were a simple
484attribute without storing the name itself.
485
486When a TokenScheme is included in a recognizer class, the class will now have
487the token types as named constants, a type-to-name map constant +TOKEN_NAMES+,
488and a grammar-specific subclass of ANTLR3::CommonToken assigned to the constant
489Token. Thus, when recognizers need to manufacture tokens, instead of using the
490generic CommonToken class, they can create tokens using the customized Token
491class provided by the token scheme.
492
493If you need to use a token class other than CommonToken, you can pass the class
494as a parameter to TokenScheme.new, which will be used in place of the
495dynamically-created CommonToken subclass.
496
497=end
498
class TokenScheme < ::Module
  include TokenFactory
  
  #
  # Construct a new token scheme module. If +tk_class+ is given, it is used
  # as the scheme's token class; otherwise a fresh subclass of
  # ANTLR3::CommonToken is created. The optional block is evaluated in the
  # context of the new module, like Module#module_eval.
  #
  def self.new( tk_class = nil, &body )
    super() do
      tk_class ||= Class.new( ::ANTLR3::CommonToken )
      self.token_class = tk_class
      
      # TOKEN_NAMES maps token type values -> name strings, seeded with
      # the built-in types
      const_set( :TOKEN_NAMES, ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.clone )
      
      # @types is the inverse map (name -> type value); @unused tracks the
      # next type value available for automatic assignment
      @types  = ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.invert
      @unused = ::ANTLR3::Constants::MIN_TOKEN_TYPE
      
      scheme = self
      define_method( :token_scheme ) { scheme }
      define_method( :token_names )  { scheme::TOKEN_NAMES }
      define_method( :token_name ) do |type|
        begin
          token_names[ type ] or super
        rescue NoMethodError
          ::ANTLR3::CommonToken.token_name( type )
        end
      end
      module_function :token_name, :token_names
      
      include ANTLR3::Constants
      
      body and module_eval( &body )
    end
  end
  
  #
  # Convenience constructor: builds a scheme from a flat list that may mix
  # token names, name-to-value hashes, and (optionally) a leading Class to
  # use as the token class.
  #
  def self.build( *token_names )
    token_names = [ token_names ].flatten!
    token_names.compact!
    token_names.uniq!
    tk_class = Class === token_names.first ? token_names.shift : nil
    value_maps, names = token_names.partition { |i| Hash === i }
    new( tk_class ) do
      for value_map in value_maps
        define_tokens( value_map )
      end
      
      for name in names
        define_token( name )
      end
    end
  end
  
  
  # when a scheme is included in a class, also extend the class so the
  # token constants and name maps are available at the class level
  def included( mod )
    super
    mod.extend( self )
  end
  private :included
  
  # +unused+ is the next type value available for automatic assignment;
  # +types+ maps token names to their integer type values
  attr_reader :unused, :types
  
  # define several token types at once from a name => value hash
  def define_tokens( token_map = {} )
    for token_name, token_value in token_map
      define_token( token_name, token_value )
    end
    return self
  end
  
  #
  # Define a single token type. When +value+ is omitted, the next unused
  # type value is assigned automatically. Raises NameError if +name+ is
  # already defined with a different value. Names that are not valid
  # constant names (e.g. literal strings like "'='") are exposed through an
  # anonymous <tt>T__<value></tt> constant instead.
  #
  def define_token( name, value = nil )
    name = name.to_s
    
    if current_value = @types[ name ]
      # token type has already been defined
      # raise an error unless value is the same as the current value
      value ||= current_value
      unless current_value == value
        raise NameError.new( 
          "new token type definition ``#{ name } = #{ value }'' conflicts " <<
          "with existing type definition ``#{ name } = #{ current_value }''", name
        )
      end
    else
      value ||= @unused
      if name =~ /^[A-Z]\w*$/
        const_set( name, @types[ name ] = value )
      else
        constant = "T__#{ value }"
        const_set( constant, @types[ constant ] = value )
        @types[ name ] = value
      end
      register_name( value, name ) unless built_in_type?( value )
    end
    
    # advance the automatic-assignment counter past this value
    value >= @unused and @unused = value + 1
    return self
  end
  
  # register type-value => name mappings, either from a single hash or
  # from a list of names assigned sequentially starting at MIN_TOKEN_TYPE
  def register_names( *names )
    if names.length == 1 and Hash === names.first
      names.first.each do |value, name|
        register_name( value, name )
      end
    else
      names.each_with_index do |name, i|
        type_value = Constants::MIN_TOKEN_TYPE + i
        register_name( type_value, name )
      end
    end
  end
  
  #
  # Record +name+ as the display name for token type +type_value+ in the
  # TOKEN_NAMES table. An existing anonymous <tt>T__<value></tt> name is
  # upgraded to the literal name; a downgrade back to the anonymous form is
  # ignored; any other conflicting re-registration raises NameError.
  #
  def register_name( type_value, name )
    name = name.to_s.freeze
    if token_names.has_key?( type_value )
      current_name = token_names[ type_value ]
      current_name == name and return name
      
      if current_name == "T__#{ type_value }"
        # only an anonymous name is registered -- upgrade the name to the full literal name
        token_names[ type_value ] = name
      elsif name == "T__#{ type_value }"
        # ignore name downgrade from literal to anonymous constant
        return current_name
      else
        error = NameError.new( 
          "attempted assignment of token type #{ type_value }" <<
          " to name #{ name } conflicts with existing name #{ current_name }", name
        )
        raise error
      end
    else
      # +name+ was already converted to a frozen string above
      token_names[ type_value ] = name
    end
  end
  
  # true when +type_value+ is one of the built-in token types
  def built_in_type?( type_value )
    Constants::BUILT_IN_TOKEN_NAMES.fetch( type_value, false ) and true
  end
  
  # test whether a token has been defined, either by integer type value or
  # by name
  def token_defined?( name_or_value )
    # FIX: previously switched on `value`, an undefined local variable,
    # so any call to this method raised NameError
    case name_or_value
    when Integer then token_names.has_key?( name_or_value )
    else const_defined?( name_or_value.to_s )
    end
  end
  
  # two-way lookup: an Integer argument returns the registered name (or
  # nil); any other argument is treated as a name and returns its value
  def []( name_or_value )
    case name_or_value
    when Integer then token_names.fetch( name_or_value, nil )
    else const_get( name_or_value.to_s ) rescue token_names.index( name_or_value )
    end
  end
  
  # the scheme's token class, stored in the +Token+ constant
  def token_class
    self::Token
  end
  
  # assign the scheme's token class; the class is made to include the
  # scheme (if it does not already) so its instances can resolve their
  # own type names
  def token_class=( klass )
    Class === klass or raise( TypeError, "token_class must be a Class" )
    Util.silence_warnings do
      klass < self or klass.send( :include, self )
      const_set( :Token, klass )
    end
  end
  
end
660
661end
662