1#!/usr/bin/ruby 2# encoding: utf-8 3 4=begin LICENSE 5 6[The "BSD licence"] 7Copyright (c) 2009-2010 Kyle Yetter 8All rights reserved. 9 10Redistribution and use in source and binary forms, with or without 11modification, are permitted provided that the following conditions 12are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 2. Redistributions in binary form must reproduce the above copyright 17 notice, this list of conditions and the following disclaimer in the 18 documentation and/or other materials provided with the distribution. 19 3. The name of the author may not be used to endorse or promote products 20 derived from this software without specific prior written permission. 21 22THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 23IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 24OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 25IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 26INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 27NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 31THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 33=end 34 35module ANTLR3 36 37=begin rdoc ANTLR3::Token 38 39At a minimum, tokens are data structures that bind together a chunk of text and 40a corresponding type symbol, which categorizes/characterizes the content of the 41text. Tokens also usually carry information about their location in the input, 42such as absolute character index, line number, and position within the line (or 43column). 44 45Furthermore, ANTLR tokens are assigned a "channel" number, an extra degree of 46categorization that groups things on a larger scale. Parsers will usually ignore 47tokens that have channel value 99 (the HIDDEN_CHANNEL), so you can keep things 48like comment and white space huddled together with neighboring tokens, 49effectively ignoring them without discarding them. 50 51ANTLR tokens also keep a reference to the source stream from which they 52originated. Token streams will also provide an index value for the token, which 53indicates the position of the token relative to other tokens in the stream, 54starting at zero. For example, the 22nd token pulled from a lexer by 55CommonTokenStream will have index value 21. 56 57== Token as an Interface 58 59This library provides a token implementation (see CommonToken). Additionally, 60you may write your own token class as long as you provide methods that give 61access to the attributes expected by a token. Even though most of the ANTLR 62library tries to use duck-typing techniques instead of pure object-oriented type 63checking, it's a good idea to include this ANTLR3::Token into your customized 64token class. 65 66=end 67 68module Token 69 include ANTLR3::Constants 70 include Comparable 71 72 # the token's associated chunk of text 73 attr_accessor :text 74 75 # the integer value associated with the token's type 76 attr_accessor :type 77 78 # the text's starting line number within the source (indexed starting at 1) 79 attr_accessor :line 80 81 # the text's starting position in the line within the source (indexed starting at 0) 82 attr_accessor :column 83 84 # the integer value of the channel to which the token is assigned 85 attr_accessor :channel 86 87 # the index of the token with respect to other the other tokens produced during lexing 88 attr_accessor :index 89 90 # a reference to the input stream from which the token was extracted 91 attr_accessor :input 92 93 # the absolute character index in the input at which the text starts 94 attr_accessor :start 95 96 # the absolute character index in the input at which the text ends 97 attr_accessor :stop 98 99 alias :input_stream :input 100 alias :input_stream= :input= 101 alias :token_index :index 102 alias :token_index= :index= 103 104 # 105 # The match operator has been implemented to match against several different 106 # attributes of a token for convenience in quick scripts 107 # 108 # @example Match against an integer token type constant 109 # token =~ VARIABLE_NAME => true/false 110 # @example Match against a token type name as a Symbol 111 # token =~ :FLOAT => true/false 112 # @example Match the token text against a Regular Expression 113 # token =~ /^@[a-z_]\w*$/i 114 # @example Compare the token's text to a string 115 # token =~ "class" 116 # 117 def =~ obj 118 case obj 119 when Integer then type == obj 120 when Symbol then name == obj.to_s 121 when Regexp then obj =~ text 122 when String then text == obj 123 else super 124 end 125 end 126 127 # 128 # Tokens are comparable by their stream index values 129 # 130 def <=> tk2 131 index <=> tk2.index 132 end 133 134 def initialize_copy( orig ) 135 self.index = -1 136 self.type = orig.type 137 self.channel = orig.channel 138 self.text = orig.text.clone if orig.text 139 self.start = orig.start 140 self.stop = orig.stop 141 self.line = orig.line 142 self.column = orig.column 143 self.input = orig.input 144 end 145 146 def concrete? 147 input && start && stop ? true : false 148 end 149 150 def imaginary? 151 input && start && stop ? false : true 152 end 153 154 def name 155 token_name( type ) 156 end 157 158 def source_name 159 i = input and i.source_name 160 end 161 162 def hidden? 163 channel == HIDDEN_CHANNEL 164 end 165 166 def source_text 167 concrete? ? input.substring( start, stop ) : text 168 end 169 170 # 171 # Sets the token's channel value to HIDDEN_CHANNEL 172 # 173 def hide! 174 self.channel = HIDDEN_CHANNEL 175 end 176 177 def inspect 178 text_inspect = text ? "[#{ text.inspect }] " : ' ' 179 text_position = line > 0 ? "@ line #{ line } col #{ column } " : '' 180 stream_position = start ? "(#{ range.inspect })" : '' 181 182 front = index >= 0 ? "#{ index } " : '' 183 rep = front << name << text_inspect << 184 text_position << stream_position 185 rep.strip! 186 channel == DEFAULT_CHANNEL or rep << " (#{ channel.to_s })" 187 return( rep ) 188 end 189 190 def pretty_print( printer ) 191 printer.text( inspect ) 192 end 193 194 def range 195 start..stop rescue nil 196 end 197 198 def to_i 199 index.to_i 200 end 201 202 def to_s 203 text.to_s 204 end 205 206private 207 208 def token_name( type ) 209 BUILT_IN_TOKEN_NAMES[ type ] 210 end 211end 212 213CommonToken = Struct.new( :type, :channel, :text, :input, :start, 214 :stop, :index, :line, :column ) 215 216=begin rdoc ANTLR3::CommonToken 217 218The base class for the standard implementation of Token. It is implemented as a 219simple Struct as tokens are basically simple data structures binding together a 220bunch of different information and Structs are slightly faster than a standard 221Object with accessor methods implementation. 222 223By default, ANTLR generated ruby code will provide a customized subclass of 224CommonToken to track token-type names efficiently for debugging, inspection, and 225general utility. Thus code generated for a standard combo lexer-parser grammar 226named XYZ will have a base module named XYZ and a customized CommonToken 227subclass named XYZ::Token. 228 229Here is the token structure attribute list in order: 230 231* <tt>type</tt> 232* <tt>channel</tt> 233* <tt>text</tt> 234* <tt>input</tt> 235* <tt>start</tt> 236* <tt>stop</tt> 237* <tt>index</tt> 238* <tt>line</tt> 239* <tt>column</tt> 240 241=end 242 243class CommonToken 244 include Token 245 DEFAULT_VALUES = { 246 :channel => DEFAULT_CHANNEL, 247 :index => -1, 248 :line => 0, 249 :column => -1 250 }.freeze 251 252 def self.token_name( type ) 253 BUILT_IN_TOKEN_NAMES[ type ] 254 end 255 256 def self.create( fields = {} ) 257 fields = DEFAULT_VALUES.merge( fields ) 258 args = members.map { |name| fields[ name.to_sym ] } 259 new( *args ) 260 end 261 262 # allows you to make a copy of a token with a different class 263 def self.from_token( token ) 264 new( 265 token.type, token.channel, token.text ? token.text.clone : nil, 266 token.input, token.start, token.stop, -1, token.line, token.column 267 ) 268 end 269 270 def initialize( type = nil, channel = DEFAULT_CHANNEL, text = nil, 271 input = nil, start = nil, stop = nil, index = -1, 272 line = 0, column = -1 ) 273 super 274 block_given? and yield( self ) 275 self.text.nil? && self.start && self.stop and 276 self.text = self.input.substring( self.start, self.stop ) 277 end 278 279 alias :input_stream :input 280 alias :input_stream= :input= 281 alias :token_index :index 282 alias :token_index= :index= 283end 284 285module Constants 286 287 # End of File / End of Input character and token type 288 EOF_TOKEN = CommonToken.new( EOF ).freeze 289 INVALID_TOKEN = CommonToken.new( INVALID_TOKEN_TYPE ).freeze 290 SKIP_TOKEN = CommonToken.new( INVALID_TOKEN_TYPE ).freeze 291end 292 293 294 295=begin rdoc ANTLR3::TokenSource 296 297TokenSource is a simple mixin module that demands an 298implementation of the method #next_token. In return, it 299defines methods #next and #each, which provide basic 300iterator methods for token generators. Furthermore, it 301includes Enumerable to provide the standard Ruby iteration 302methods to token generators, like lexers. 303 304=end 305 306module TokenSource 307 include Constants 308 include Enumerable 309 extend ClassMacros 310 311 abstract :next_token 312 313 def next 314 token = next_token() 315 raise StopIteration if token.nil? || token.type == EOF 316 return token 317 end 318 319 def each 320 block_given? or return enum_for( :each ) 321 while token = next_token and token.type != EOF 322 yield( token ) 323 end 324 return self 325 end 326 327 def to_stream( options = {} ) 328 if block_given? 329 CommonTokenStream.new( self, options ) { | t, stream | yield( t, stream ) } 330 else 331 CommonTokenStream.new( self, options ) 332 end 333 end 334end 335 336 337=begin rdoc ANTLR3::TokenFactory 338 339There are a variety of different entities throughout the ANTLR runtime library 340that need to create token objects This module serves as a mixin that provides 341methods for constructing tokens. 342 343Including this module provides a +token_class+ attribute. Instance of the 344including class can create tokens using the token class (which defaults to 345ANTLR3::CommonToken). Token classes are presumed to have an #initialize method 346that can be called without any parameters and the token objects are expected to 347have the standard token attributes (see ANTLR3::Token). 348 349=end 350 351module TokenFactory 352 attr_writer :token_class 353 def token_class 354 @token_class ||= begin 355 self.class.token_class rescue 356 self::Token rescue 357 ANTLR3::CommonToken 358 end 359 end 360 361 def create_token( *args ) 362 if block_given? 363 token_class.new( *args ) do |*targs| 364 yield( *targs ) 365 end 366 else 367 token_class.new( *args ) 368 end 369 end 370end 371 372 373=begin rdoc ANTLR3::TokenScheme 374 375TokenSchemes exist to handle the problem of defining token types as integer 376values while maintaining meaningful text names for the types. They are 377dynamically defined modules that map integer values to constants with token-type 378names. 379 380--- 381 382Fundamentally, tokens exist to take a chunk of text and identify it as belonging 383to some category, like "VARIABLE" or "INTEGER". In code, the category is 384represented by an integer -- some arbitrary value that ANTLR will decide to use 385as it is creating the recognizer. The purpose of using an integer (instead of 386say, a ruby symbol) is that ANTLR's decision logic often needs to test whether a 387token's type falls within a range, which is not possible with symbols. 388 389The downside of token types being represented as integers is that a developer 390needs to be able to reference the unknown type value by name in action code. 391Furthermore, code that references the type by name and tokens that can be 392inspected with names in place of type values are more meaningful to a developer. 393 394Since ANTLR requires token type names to follow capital-letter naming 395conventions, defining types as named constants of the recognizer class resolves 396the problem of referencing type values by name. Thus, a token type like 397``VARIABLE'' can be represented by a number like 5 and referenced within code by 398+VARIABLE+. However, when a recognizer creates tokens, the name of the token's 399type cannot be seen without using the data defined in the recognizer. 400 401Of course, tokens could be defined with a name attribute that could be specified 402when tokens are created. However, doing so would make tokens take up more space 403than necessary, as well as making it difficult to change the type of a token 404while maintaining a correct name value. 405 406TokenSchemes exist as a technique to manage token type referencing and name 407extraction. They: 408 4091. keep token type references clear and understandable in recognizer code 4102. permit access to a token's type-name independently of recognizer objects 4113. allow multiple classes to share the same token information 412 413== Building Token Schemes 414 415TokenScheme is a subclass of Module. Thus, it has the method 416<tt>TokenScheme.new(tk_class = nil) { ... module-level code ...}</tt>, which 417will evaluate the block in the context of the scheme (module), similarly to 418Module#module_eval. Before evaluating the block, <tt>.new</tt> will setup the 419module with the following actions: 420 4211. define a customized token class (more on that below) 4222. add a new constant, TOKEN_NAMES, which is a hash that maps types to names 4233. dynamically populate the new scheme module with a couple instance methods 4244. include ANTLR3::Constants in the new scheme module 425 426As TokenScheme the class functions as a metaclass, figuring out some of the 427scoping behavior can be mildly confusing if you're trying to get a handle of the 428entity for your own purposes. Remember that all of the instance methods of 429TokenScheme function as module-level methods of TokenScheme instances, ala 430+attr_accessor+ and friends. 431 432<tt>TokenScheme#define_token(name_symbol, int_value)</tt> adds a constant 433definition <tt>name_symbol</tt> with the value <tt>int_value</tt>. It is 434essentially like <tt>Module#const_set</tt>, except it forbids constant 435overwriting (which would mess up recognizer code fairly badly) and adds an 436inverse type-to-name map to its own <tt>TOKEN_NAMES</tt> table. 437<tt>TokenScheme#define_tokens</tt> is a convenience method for defining many 438types with a hash pairing names to values. 439 440<tt>TokenScheme#register_name(value, name_string)</tt> specifies a custom 441type-to-name definition. This is particularly useful for the anonymous tokens 442that ANTLR generates for literal strings in the grammar specification. For 443example, if you refer to the literal <tt>'='</tt> in some parser rule in your 444grammar, ANTLR will add a lexer rule for the literal and give the token a name 445like <tt>T__<i>x</i></tt>, where <tt><i>x</i></tt> is the type's integer value. 446Since this is pretty meaningless to a developer, generated code should add a 447special name definition for type value <tt><i>x</i></tt> with the string 448<tt>"'='"</tt>. 449 450=== Sample TokenScheme Construction 451 452 TokenData = ANTLR3::TokenScheme.new do 453 define_tokens( 454 :INT => 4, 455 :ID => 6, 456 :T__5 => 5, 457 :WS => 7 458 ) 459 460 # note the self:: scoping below is due to the fact that 461 # ruby lexically-scopes constant names instead of 462 # looking up in the current scope 463 register_name(self::T__5, "'='") 464 end 465 466 TokenData::ID # => 6 467 TokenData::T__5 # => 5 468 TokenData.token_name(4) # => 'INT' 469 TokenData.token_name(5) # => "'='" 470 471 class ARecognizerOrSuch < ANTLR3::Parser 472 include TokenData 473 ID # => 6 474 end 475 476== Custom Token Classes and Relationship with Tokens 477 478When a TokenScheme is created, it will define a subclass of ANTLR3::CommonToken 479and assigned it to the constant name +Token+. This token class will both include 480and extend the scheme module. Since token schemes define the private instance 481method <tt>token_name(type)</tt>, instances of the token class are now able to 482provide their type names. The Token method <tt>name</tt> uses the 483<tt>token_name</tt> method to provide the type name as if it were a simple 484attribute without storing the name itself. 485 486When a TokenScheme is included in a recognizer class, the class will now have 487the token types as named constants, a type-to-name map constant +TOKEN_NAMES+, 488and a grammar-specific subclass of ANTLR3::CommonToken assigned to the constant 489Token. Thus, when recognizers need to manufacture tokens, instead of using the 490generic CommonToken class, they can create tokens using the customized Token 491class provided by the token scheme. 492 493If you need to use a token class other than CommonToken, you can pass the class 494as a parameter to TokenScheme.new, which will be used in place of the 495dynamically-created CommonToken subclass. 496 497=end 498 499class TokenScheme < ::Module 500 include TokenFactory 501 502 def self.new( tk_class = nil, &body ) 503 super() do 504 tk_class ||= Class.new( ::ANTLR3::CommonToken ) 505 self.token_class = tk_class 506 507 const_set( :TOKEN_NAMES, ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.clone ) 508 509 @types = ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.invert 510 @unused = ::ANTLR3::Constants::MIN_TOKEN_TYPE 511 512 scheme = self 513 define_method( :token_scheme ) { scheme } 514 define_method( :token_names ) { scheme::TOKEN_NAMES } 515 define_method( :token_name ) do |type| 516 begin 517 token_names[ type ] or super 518 rescue NoMethodError 519 ::ANTLR3::CommonToken.token_name( type ) 520 end 521 end 522 module_function :token_name, :token_names 523 524 include ANTLR3::Constants 525 526 body and module_eval( &body ) 527 end 528 end 529 530 def self.build( *token_names ) 531 token_names = [ token_names ].flatten! 532 token_names.compact! 533 token_names.uniq! 534 tk_class = Class === token_names.first ? token_names.shift : nil 535 value_maps, names = token_names.partition { |i| Hash === i } 536 new( tk_class ) do 537 for value_map in value_maps 538 define_tokens( value_map ) 539 end 540 541 for name in names 542 define_token( name ) 543 end 544 end 545 end 546 547 548 def included( mod ) 549 super 550 mod.extend( self ) 551 end 552 private :included 553 554 attr_reader :unused, :types 555 556 def define_tokens( token_map = {} ) 557 for token_name, token_value in token_map 558 define_token( token_name, token_value ) 559 end 560 return self 561 end 562 563 def define_token( name, value = nil ) 564 name = name.to_s 565 566 if current_value = @types[ name ] 567 # token type has already been defined 568 # raise an error unless value is the same as the current value 569 value ||= current_value 570 unless current_value == value 571 raise NameError.new( 572 "new token type definition ``#{ name } = #{ value }'' conflicts " << 573 "with existing type definition ``#{ name } = #{ current_value }''", name 574 ) 575 end 576 else 577 value ||= @unused 578 if name =~ /^[A-Z]\w*$/ 579 const_set( name, @types[ name ] = value ) 580 else 581 constant = "T__#{ value }" 582 const_set( constant, @types[ constant ] = value ) 583 @types[ name ] = value 584 end 585 register_name( value, name ) unless built_in_type?( value ) 586 end 587 588 value >= @unused and @unused = value + 1 589 return self 590 end 591 592 def register_names( *names ) 593 if names.length == 1 and Hash === names.first 594 names.first.each do |value, name| 595 register_name( value, name ) 596 end 597 else 598 names.each_with_index do |name, i| 599 type_value = Constants::MIN_TOKEN_TYPE + i 600 register_name( type_value, name ) 601 end 602 end 603 end 604 605 def register_name( type_value, name ) 606 name = name.to_s.freeze 607 if token_names.has_key?( type_value ) 608 current_name = token_names[ type_value ] 609 current_name == name and return name 610 611 if current_name == "T__#{ type_value }" 612 # only an anonymous name is registered -- upgrade the name to the full literal name 613 token_names[ type_value ] = name 614 elsif name == "T__#{ type_value }" 615 # ignore name downgrade from literal to anonymous constant 616 return current_name 617 else 618 error = NameError.new( 619 "attempted assignment of token type #{ type_value }" << 620 " to name #{ name } conflicts with existing name #{ current_name }", name 621 ) 622 raise error 623 end 624 else 625 token_names[ type_value ] = name.to_s.freeze 626 end 627 end 628 629 def built_in_type?( type_value ) 630 Constants::BUILT_IN_TOKEN_NAMES.fetch( type_value, false ) and true 631 end 632 633 def token_defined?( name_or_value ) 634 case value 635 when Integer then token_names.has_key?( name_or_value ) 636 else const_defined?( name_or_value.to_s ) 637 end 638 end 639 640 def []( name_or_value ) 641 case name_or_value 642 when Integer then token_names.fetch( name_or_value, nil ) 643 else const_get( name_or_value.to_s ) rescue token_names.index( name_or_value ) 644 end 645 end 646 647 def token_class 648 self::Token 649 end 650 651 def token_class=( klass ) 652 Class === klass or raise( TypeError, "token_class must be a Class" ) 653 Util.silence_warnings do 654 klass < self or klass.send( :include, self ) 655 const_set( :Token, klass ) 656 end 657 end 658 659end 660 661end 662