1# Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
2#
3# Permission to use, copy, modify, and distribute this software and its
4# documentation for any purpose with or without fee is hereby granted,
5# provided that the above copyright notice and this permission notice
6# appear in all copies.
7#
8# THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
9# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
11# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
14# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
16"""Tokenize DNS master file format"""
17
18import cStringIO
19import sys
20
21import dns.exception
22import dns.name
23import dns.ttl
24
# Characters that terminate an identifier while the tokenizer is outside a
# quoted string.  The table is consulted with "in", so every value is
# simply True.
_DELIMITERS = dict.fromkeys(' \t\n;()"', True)

# While reading a quoted string, only the closing double quote is a
# delimiter.
_QUOTING_DELIMITERS = dict.fromkeys('"', True)

# Token type codes, numbered consecutively from zero.
(EOF,
 EOL,
 WHITESPACE,
 IDENTIFIER,
 QUOTED_STRING,
 COMMENT,
 DELIMITER) = range(7)
43
class UngetBufferFull(dns.exception.DNSException):
    """The unget buffer already holds an item.

    Raised when a token or character is ungotten while the corresponding
    one-slot unget buffer is still occupied."""
48
class Token(object):
    """A DNS master file format token.

    For compatibility with old-style tuple tokens, a Token also behaves
    like the two-item sequence C{(ttype, value)}: it has length 2, can be
    iterated, and can be indexed with 0 or 1.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Is this an EOF token?"""
        return self.ttype == EOF

    def is_eol(self):
        """Is this an EOL token?"""
        return self.ttype == EOL

    def is_whitespace(self):
        """Is this a WHITESPACE token?"""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Is this an IDENTIFIER token?"""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Is this a QUOTED_STRING token?"""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Is this a COMMENT token?"""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Is this a DELIMITER token?"""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Is this either an EOL or an EOF token?"""
        return (self.ttype == EOL or self.ttype == EOF)

    def __eq__(self, other):
        """Tokens are equal when both type and value match.

        The has_escape flag does not take part in the comparison, and a
        Token never equals a non-Token.
        """
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        """The inverse of L{__eq__}."""
        if not isinstance(other, Token):
            return True
        return (self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token whose value has its backslash escapes resolved.

        If has_escape is false the token itself is returned.  Otherwise a
        new Token of the same type is returned (with has_escape false) in
        which a backslash followed by three decimal digits is replaced by
        the character with that code, and a backslash followed by any
        other character is replaced by that character.

        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape sequence
        @raises dns.exception.SyntaxError: a decimal escape does not
        consist of three digits, or encodes a value greater than 255
        """
        if not self.has_escape:
            return self
        text = self.value
        length = len(text)
        # Collect the pieces and join once instead of growing a string.
        pieces = []
        i = 0
        while i < length:
            c = text[i]
            i += 1
            if c == '\\':
                if i >= length:
                    raise dns.exception.UnexpectedEnd
                c = text[i]
                i += 1
                if c.isdigit():
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c2 = text[i]
                    i += 1
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c3 = text[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # A decimal escape names a single octet; reject larger
                    # values here rather than letting chr() fail with an
                    # unrelated ValueError.
                    if codepoint > 255:
                        raise dns.exception.SyntaxError
                    c = chr(codepoint)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
157
class Tokenizer(object):
    """A DNS master file format tokenizer.

    Tokens are returned as L{Token} objects, which carry an int type and
    a string value (and can still be used like a (type, value) tuple).
    The valid types are EOF, EOL, WHITESPACE, IDENTIFIER, QUOTED_STRING,
    COMMENT, and DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: Token object
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.  When omitted it defaults to '<string>' for string
        input, '<stdin>' for sys.stdin and '<file>' for any other file.
        @type filename: string
        """

        if isinstance(f, str):
            f = cStringIO.StringIO(f)
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        An ungotten character, if any, is returned first.  Once the end
        of the input has been seen the empty string is returned without
        reading the file again.  The line number is advanced when a
        newline is read from the file (not when one is re-read from the
        unget buffer).

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                # A newline only counts as whitespace inside parentheses.
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten WHITESPACE or COMMENT token is handed back only
            # if the caller asked for that kind of token; otherwise it is
            # discarded and a fresh token is read below.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    # Nothing has been accumulated yet, so the delimiter
                    # itself decides what happens next.
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if not self.multiline > 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: from here on only '"' is a
                            # delimiter.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote that was ungotten when the
                            # previous QUOTED_STRING token was returned.
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Comment: collect everything up to (but not
                        # including) the end of line or end of input.
                        while 1:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError('unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            # Inside parentheses the comment and its
                            # newline are just whitespace.
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    # The delimiter ends the current token; leave it for
                    # the next call.
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    # Escapes inside quoted strings are resolved here, so
                    # has_escape stays False for QUOTED_STRING tokens.
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                        # A decimal escape names a single octet; reject
                        # larger values instead of letting chr() fail.
                        if codepoint > 255:
                            raise dns.exception.SyntaxError
                        c = chr(codepoint)
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            # The loop ended on end of input with nothing accumulated.
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Iteration stops when an EOF token is read.

        @rtype: Token object
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    def __iter__(self):
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        Only tokens consisting entirely of digits are accepted, so the
        result is never negative.

        @raises dns.exception.SyntaxError:
        @rtype: int
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return int(token.value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError:
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError:
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        The largest accepted value is 4294967295 (2**32 - 1).

        @raises dns.exception.SyntaxError:
        @rtype: int
        """

        # int() already yields an arbitrary-precision value when needed,
        # so the generic integer reader can be shared with the narrower
        # helpers.
        value = self.get_int()
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        Both identifiers and quoted strings are accepted.

        @param origin: not used by this method
        @raises dns.exception.SyntaxError:
        @rtype: string
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an identifier.

        @param origin: not used by this method
        @raises dns.exception.SyntaxError:
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token text is passed to dns.name.from_text without being
        unescaped first.

        @param origin: passed through to dns.name.from_text
        @raises dns.exception.SyntaxError:
        @rtype: dns.name.Name object"""

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError:
        @rtype: string
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and interpret it as a TTL.

        @raises dns.exception.SyntaxError: the token is not an identifier
        @return: the result of dns.ttl.from_text applied to the token text
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)
548