# cpp.py revision 1198fd38645ff94bf48daae10f8b74903444badc
1# a glorified C pre-processor parser
2
3import sys, re, string
4from utils import *
5from defaults import *
6
# Debug switches: flip any of these to True to trace the corresponding
# stage of the parser on stdout.
debugTokens             = False
debugDirectiveTokenizer = False
debugLineParsing        = False
debugCppExpr            = False
debugOptimIf01          = False
12
13#####################################################################################
14#####################################################################################
15#####                                                                           #####
16#####           C P P   T O K E N S                                             #####
17#####                                                                           #####
18#####################################################################################
19#####################################################################################
20
# the list of supported C-preprocessor tokens
# plus a couple of C tokens as well
tokEOF       = "\0"
tokLN        = "\n"
tokSTRINGIFY = "#"
tokCONCAT    = "##"
tokLOGICAND  = "&&"
tokLOGICOR   = "||"
tokSHL       = "<<"
tokSHR       = ">>"
tokEQUAL     = "=="
tokNEQUAL    = "!="
tokLT        = "<"
tokLTE       = "<="
tokGT        = ">"
tokGTE       = ">="
tokELLIPSIS  = "..."
tokSPACE     = " "
tokDEFINED   = "defined"
tokLPAREN    = "("
tokRPAREN    = ")"
tokNOT       = "!"
tokPLUS      = "+"
tokMINUS     = "-"
tokMULTIPLY  = "*"
tokDIVIDE    = "/"
tokMODULUS   = "%"
tokBINAND    = "&"
tokBINOR     = "|"
tokBINXOR    = "^"
tokCOMMA     = ","
tokLBRACE    = "{"
tokRBRACE    = "}"
tokARROW     = "->"
tokINCREMENT = "++"
tokDECREMENT = "--"
# pseudo-token ids, used when a token's id differs from its value
tokNUMBER    = "<number>"
tokIDENT     = "<ident>"
tokSTRING    = "<string>"
60
class Token:
    """A single lexical token.

       Each token records its position in the source (lineno/colno), an
       'id' string naming the token's class, and a 'value' holding the
       original token text. For example, a run of spaces and tabs is
       folded into one tokSPACE token whose value is the original
       whitespace sequence."""

    def __init__(self):
        self.id     = None
        self.value  = None
        self.lineno = 0
        self.colno  = 0

    def set(self, id, val=None):
        """set the token's id, and its value (the value defaults to the id
           itself when omitted or empty)"""
        self.id    = id
        self.value = val if val else id
        return None

    def copyFrom(self, src):
        """make this token an exact copy of 'src'"""
        for field in ('id', 'value', 'lineno', 'colno'):
            setattr(self, field, getattr(src, field))

    def __repr__(self):
        tid = self.id
        if tid == tokIDENT:
            return "(ident %s)" % self.value
        if tid == tokNUMBER:
            return "(number %s)" % self.value
        if tid == tokSTRING:
            return "(string '%s')" % self.value
        if tid == tokLN:
            return "<LN>"
        if tid == tokEOF:
            return "<EOF>"
        # a trailing backslash is stored as a tokSPACE whose value is "\"
        if tid == tokSPACE and self.value == "\\":
            return "<\\>"
        return tid

    def __str__(self):
        tid = self.id
        if tid in (tokIDENT, tokNUMBER, tokSTRING):
            return self.value
        if tid == tokEOF:
            return "<EOF>"
        if tid == tokSPACE:
            # a trailing backslash is rendered as backslash-newline
            return "\\\n" if self.value == "\\" else self.value
        return tid
125
class BadExpectedToken(Exception):
    """Raised when the parser meets a token it did not expect.

       The message is echoed on stdout (historical behaviour of this tool,
       kept as-is) and also stored on the exception so that str(exc) is
       meaningful to callers.

       BUGFIX: the original used a Python-2 'print msg' statement and never
       called Exception.__init__, so the message was lost once printed."""

    def __init__(self, msg):
        # store the message on the exception instead of discarding it
        Exception.__init__(self, msg)
        print(msg)
129
130
131#####################################################################################
132#####################################################################################
133#####                                                                           #####
134#####           C P P   T O K E N I Z E R                                       #####
135#####                                                                           #####
136#####################################################################################
137#####################################################################################
138
# list of long symbols, i.e. those that take more than one characters
# (the tokenizer tries these, matched on their first character, before
#  falling back to a single-character token)
cppLongSymbols = [ tokCONCAT, tokLOGICAND, tokLOGICOR, tokSHL, tokSHR, tokELLIPSIS, tokEQUAL,\
                   tokNEQUAL, tokLTE, tokGTE, tokARROW, tokINCREMENT, tokDECREMENT ]
142
class CppTokenizer:
    """an abstract class used to convert some input text into a list
       of tokens. real implementations follow and differ in the format
       of the input text only"""

    def __init__(self):
        """initialize a new CppTokenizer object"""
        self.eof  = False  # end of file reached ?
        self.text = None   # content of current line, with final \n stripped
        self.line = 0      # number of current line
        self.pos  = 0      # current character position in current line
        self.len  = 0      # length of current line text
        self.held = Token()

    def setLineText(self,line):
        """set the content of the (next) current line. should be called
           by fillLineText() in derived classes"""
        self.text = line
        self.len  = len(line)
        self.pos  = 0

    def fillLineText(self):
        """refresh the content of 'line' with a new line of input"""
        # to be overriden by subclasses; the default signals end of input
        self.eof = True

    def markPos(self,tok):
        """mark the position of the current token in the source file"""
        if self.eof or self.pos > self.len:
            # past the end of the current line: the token will come from
            # the next input line
            tok.lineno = self.line + 1
            tok.colno  = 0
        else:
            tok.lineno = self.line
            tok.colno  = self.pos

    def peekChar(self):
        """return the character under the cursor without consuming it.
           returns tokLN at end of line and tokEOF at end of input."""
        if self.eof:
            return tokEOF

        if self.pos > self.len:
            # we consumed the virtual end-of-line: fetch the next line
            self.pos   = 0
            self.line += 1
            self.fillLineText()
            if self.eof:
                return tokEOF

        if self.pos == self.len:
            return tokLN
        else:
            return self.text[self.pos]

    def peekNChar(self,n):
        """try to peek the next n chars on the same line; returns None if
           fewer than n characters remain on the line"""
        if self.pos + n > self.len:
            return None
        return self.text[self.pos:self.pos+n]

    def skipChar(self):
        """increment the cursor position, unless at end of input"""
        if not self.eof:
            self.pos += 1

    def skipNChars(self,n):
        """skip the next n characters"""
        if self.pos + n <= self.len:
            self.pos += n
        else:
            while n > 0:
                self.skipChar()
                n -= 1

    def nextChar(self):
        """retrieve the character at the current cursor position, then skip it"""
        result = self.peekChar()
        self.skipChar()
        return  result

    def getEscape(self):
        """consume and return the characters that follow a backslash:
           handles octal (\\0nn), hexadecimal (\\xNN) and unicode (\\uNNNN)
           escape sequences; anything else is returned as-is."""
        result = self.nextChar()
        if result == "0":
            # octal number ?
            num = self.peekNChar(3)
            if num != None:
                isOctal = True
                for d in num:
                    if not d in "01234567":
                        isOctal = False
                        break
                if isOctal:
                    result += num
                    self.skipNChars(3)
        elif result == "x" or result == "X":
            # hex number ?
            num = self.peekNChar(2)
            if num != None:
                isHex = True
                for d in num:
                    # BUGFIX: the original digit set was missing '9'
                    if not d in "0123456789abcdefABCDEF":
                        isHex = False
                        break
                if isHex:
                    result += num
                    self.skipNChars(2)
        elif result == "u" or result == "U":
            # unicode char ?
            num = self.peekNChar(4)
            if num != None:
                isHex = True
                for d in num:
                    # BUGFIX: the original digit set was missing '9'
                    if not d in "0123456789abcdefABCDEF":
                        isHex = False
                        break
                if isHex:
                    result += num
                    self.skipNChars(4)

        return result

    def nextRealToken(self,tok):
        """return next CPP token, used internally by nextToken()"""
        c = self.nextChar()
        if c == tokEOF or c == tokLN:
            return tok.set(c)

        if c == '/':
            c = self.peekChar()
            if c == '/':   # C++ comment line
                self.skipChar()
                while 1:
                    c = self.nextChar()
                    if c == tokEOF or c == tokLN:
                        break
                return tok.set(tokLN)
            if c == '*':   # C comment start
                self.skipChar()
                value = "/*"
                prev_c = None
                while 1:
                    c = self.nextChar()
                    if c == tokEOF:
                        return tok.set(tokEOF,value)
                    if c == '/' and prev_c == '*':
                        break
                    prev_c = c
                    value += c

                value += "/"
                # the whole comment (with its original text) becomes a space
                return tok.set(tokSPACE,value)
            c = '/'

        if c.isspace():
            # coalesce consecutive whitespace into a single tokSPACE
            while 1:
                c2 = self.peekChar()
                if c2 == tokLN or not c2.isspace():
                    break
                c += c2
                self.skipChar()
            return tok.set(tokSPACE,c)

        if c == '\\':
            if debugTokens:
                print("nextRealToken: \\ found, next token is '%s'" % repr(self.peekChar()))
            if self.peekChar() == tokLN:   # trailing \
                # eat the tokLN
                self.skipChar()
                # we replace a trailing \ by a tokSPACE whose value is
                # simply "\\". this allows us to detect them later when
                # needed.
                return tok.set(tokSPACE,"\\")
            else:
                # treat as a single token here ?
                c +=self.getEscape()
                return tok.set(c)

        if c == "'":  # chars
            c2 = self.nextChar()
            c += c2
            if c2 == '\\':
                c += self.getEscape()

            while 1:
                c2 = self.nextChar()
                if c2 == tokEOF:
                    break
                c += c2
                if c2 == "'":
                    break

            return tok.set(tokSTRING, c)

        if c == '"':  # strings
            quote = 0
            while 1:
                c2  = self.nextChar()
                if c2 == tokEOF:
                    return tok.set(tokSTRING,c)

                c += c2
                if not quote:
                    if c2 == '"':
                        return tok.set(tokSTRING,c)
                    if c2 == "\\":
                        quote = 1
                else:
                    quote = 0

        if c >= "0" and c <= "9":  # integers ?
            while 1:
                c2 = self.peekChar()
                if c2 == tokLN or (not c2.isalnum() and c2 != "_"):
                    break
                c += c2
                self.skipChar()
            return tok.set(tokNUMBER,c)

        if c.isalnum() or c == "_":  # identifiers ?
            while 1:
                c2 = self.peekChar()
                if c2 == tokLN or (not c2.isalnum() and c2 != "_"):
                    break
                c += c2
                self.skipChar()
            if c == tokDEFINED:
                return tok.set(tokDEFINED)
            else:
                return tok.set(tokIDENT,c)

        # check special symbols: try to extend the single character into
        # one of the known multi-character operators
        for sk in cppLongSymbols:
            if c == sk[0]:
                sklen = len(sk[1:])
                if self.pos + sklen <= self.len and \
                   self.text[self.pos:self.pos+sklen] == sk[1:]:
                    self.pos += sklen
                    return tok.set(sk)

        return tok.set(c)

    def nextToken(self,tok):
        """return the next token from the input text. this function
           really updates 'tok', and does not return a new one"""
        self.markPos(tok)
        self.nextRealToken(tok)

    def getToken(self):
        """return a fresh Token object holding the next token"""
        tok = Token()
        self.nextToken(tok)
        if debugTokens:
            print("getTokens: %s" % repr(tok))
        return tok

    def toTokenList(self):
        """convert the input text of a CppTokenizer into a direct
           list of token objects. tokEOF is stripped from the result"""
        result = []
        while 1:
            tok = Token()
            self.nextToken(tok)
            if tok.id == tokEOF:
                break
            result.append(tok)
        return result
406
class CppLineTokenizer(CppTokenizer):
    """a CppTokenizer specialization whose input is a single line of text"""

    def __init__(self, line, lineno=1):
        """store 'line' as the only line of input, numbered 'lineno'"""
        CppTokenizer.__init__(self)
        self.line = lineno
        self.setLineText(line)
413
414
class CppLinesTokenizer(CppTokenizer):
    """a CppTokenizer derived class that accepts a list of text lines as input.
       the lines must not have a trailing \\n"""

    def __init__(self,lines=None,lineno=1):
        """initialize a CppLinesTokenizer. you can later add lines using addLines()

           BUGFIX: the original used a mutable default argument (lines=[]),
           so every default-constructed tokenizer shared -- and mutated,
           through addLine() -- the same list. A None sentinel keeps the
           interface identical while giving each instance its own list."""
        CppTokenizer.__init__(self)
        if lines is None:
            lines = []
        self.line  = lineno
        self.lines = lines
        self.index = 0
        self.count = len(lines)

        if self.count > 0:
            self.fillLineText()
        else:
            self.eof = True

    def addLine(self,line):
        """add a line to a CppLinesTokenizer. this can be done after tokenization
           happens"""
        if self.count == 0:
            self.setLineText(line)
            self.index = 1
        self.lines.append(line)
        self.count += 1
        self.eof    = False

    def fillLineText(self):
        """grab the next stored line, or set the eof flag"""
        if self.index < self.count:
            self.setLineText(self.lines[self.index])
            self.index += 1
        else:
            self.eof = True
447
448
class CppFileTokenizer(CppTokenizer):
    """a CppTokenizer derived class that reads its input from a file object"""

    def __init__(self, file, lineno=1):
        CppTokenizer.__init__(self)
        self.file = file
        self.line = lineno

    def fillLineText(self):
        """read the next line from the file, stripping a trailing \\n or \\r\\n"""
        line = self.file.readline()
        if not line:
            # readline() returns "" at end of file
            self.eof = True
            return
        if line.endswith("\n"):
            line = line[:-1]
        if line.endswith("\r"):
            line = line[:-1]
        self.setLineText(line)
465
466# Unit testing
467#
class CppTokenizerTester:
    """a class used to test CppTokenizer classes.

       NOTE: the original used the Python-2-only 'raise Exc, "msg"' syntax;
       calling the exception type instead works in both Python 2 and 3."""

    def __init__(self,tokenizer=None):
        self.tokenizer = tokenizer
        self.token     = Token()

    def setTokenizer(self,tokenizer):
        """select the tokenizer that the expectXXX methods will drain"""
        self.tokenizer = tokenizer

    def expect(self,id):
        """check that the next token has the given id (or, for identifier and
           number tokens, the given value); raises BadExpectedToken otherwise"""
        self.tokenizer.nextToken(self.token)
        tokid = self.token.id
        if tokid == id:
            return
        if self.token.value == id and (tokid == tokIDENT or tokid == tokNUMBER):
            return
        raise BadExpectedToken("###  BAD TOKEN: '%s' expecting '%s'" % (self.token.id,id))

    def expectToken(self,id,line,col):
        """like expect(), but also checks the token's line/column position"""
        self.expect(id)
        if self.token.lineno != line:
            raise BadExpectedToken("###  BAD LINENO: token '%s' got '%d' expecting '%d'" % (id,self.token.lineno,line))
        if self.token.colno != col:
            raise BadExpectedToken("###  BAD COLNO: '%d' expecting '%d'" % (self.token.colno,col))

    def expectTokenVal(self,id,value,line,col):
        """like expectToken(), but also checks the token's value"""
        self.expectToken(id,line,col)
        if self.token.value != value:
            raise BadExpectedToken("###  BAD VALUE: '%s' expecting '%s'" % (self.token.value,value))

    def expectList(self,list):
        """run expect() over a list of expected token ids/values"""
        for item in list:
            self.expect(item)
501
def test_CppTokenizer():
    """self-test for the CppTokenizer implementations above; raises
       BadExpectedToken on failure, returns True on success"""
    tester = CppTokenizerTester()

    # symbols, identifiers and numbers on a single line
    tester.setTokenizer( CppLineTokenizer("#an/example  && (01923_xy)") )
    tester.expectList( ["#", "an", "/", "example", tokSPACE, tokLOGICAND, tokSPACE, tokLPAREN,
                        "01923_xy", tokRPAREN, tokLN, tokEOF] )

    # 'defined' keyword recognition
    tester.setTokenizer( CppLineTokenizer("FOO(BAR) && defined(BAZ)") )
    tester.expectList( ["FOO", tokLPAREN, "BAR", tokRPAREN, tokSPACE, tokLOGICAND, tokSPACE,
                        tokDEFINED, tokLPAREN, "BAZ", tokRPAREN, tokLN, tokEOF] )

    # a multi-line C comment collapses into a single space token
    tester.setTokenizer( CppLinesTokenizer( ["/*", "#", "*/"] ) )
    tester.expectList( [ tokSPACE, tokLN, tokEOF ] )

    tester.setTokenizer( CppLinesTokenizer( ["first", "second"] ) )
    tester.expectList( [ "first", tokLN, "second", tokLN, tokEOF ] )

    # line/column positions
    tester.setTokenizer( CppLinesTokenizer( ["first second", "  third"] ) )
    for tid, lineno, colno in [ ("first", 1, 0), (tokSPACE, 1, 5), ("second", 1, 6),
                                (tokLN, 1, 12), (tokSPACE, 2, 0), ("third", 2, 2) ]:
        tester.expectToken( tid, lineno, colno )

    # a C comment spanning two lines keeps its full text as its value
    tester.setTokenizer( CppLinesTokenizer( [ "boo /* what the", "hell */" ] ) )
    tester.expectList( [ "boo", tokSPACE ] )
    tester.expectTokenVal( tokSPACE, "/* what the\nhell */", 1, 4 )
    tester.expectList( [ tokLN, tokEOF ] )

    # a trailing backslash becomes a tokSPACE whose value is "\"
    tester.setTokenizer( CppLinesTokenizer( [ "an \\", " example" ] ) )
    for tid, lineno, colno in [ ("an", 1, 0), (tokSPACE, 1, 2) ]:
        tester.expectToken( tid, lineno, colno )
    tester.expectTokenVal( tokSPACE, "\\", 1, 3 )
    for tid, lineno, colno in [ (tokSPACE, 2, 0), ("example", 2, 1), (tokLN, 2, 8) ]:
        tester.expectToken( tid, lineno, colno )

    return True
541
542
543#####################################################################################
544#####################################################################################
545#####                                                                           #####
546#####           C P P   E X P R E S S I O N S                                   #####
547#####                                                                           #####
548#####################################################################################
549#####################################################################################
550
class CppExpr:
    """a class that models the condition of #if directives into
        an expression tree. each node in the tree is of the form (op,arg) or (op,arg1,arg2)
        where "op" is a string describing the operation"""

    # one-operand and two-operand C operators recognized in #if expressions
    unaries  = [ "!", "~" ]
    binaries = [ "+", "-", "<", "<=", ">=", ">", "&&", "||", "*", "/", "%", "&", "|", "^", "<<", ">>", "==", "!=", "?", ":" ]
    # binding strength of each operator: higher binds tighter.
    # used by the precedence-climbing parser (parseExpression) below.
    precedences = {
        "?": 1, ":": 1,
        "||": 2,
        "&&": 3,
        "|": 4,
        "^": 5,
        "&": 6,
        "==": 7, "!=": 7,
        "<": 8, "<=": 8, ">": 8, ">=": 8,
        "<<": 9, ">>": 9,
        "+": 10, "-": 10,
        "*": 11, "/": 11, "%": 11,
        "!": 12, "~": 12
    }

    # matches a C pre-processor constant (a run of word characters/digits)
    re_cpp_constant = re.compile(r"((\d|\w|_)+)")
574
575    def __init__(self, tokens):
576        """initialize a CppExpr. 'tokens' must be a CppToken list"""
577        self.tok  = tokens
578        self.n    = len(tokens)
579        self.i    = 0
580        if debugCppExpr:
581            print "CppExpr: trying to parse %s" % repr(tokens)
582        self.expr = self.parseExpression(0)
583        if debugCppExpr:
584            print "CppExpr: got " + repr(self.expr)
585        if self.i != self.n:
586            print 'crap at end of input (%d != %d): %s' % (self.i, self.n, repr(tokens))
587            raise
588
589
590    def throw(self, exception, msg):
591        if self.i < self.n:
592            tok = self.tok[self.i]
593            print "%d:%d: %s" % (tok.lineno,tok.colno,msg)
594        else:
595            print "EOF: %s" % msg
596        raise exception(msg)
597
598
599    def skip_spaces(self):
600        """skip spaces in input token list"""
601        while self.i < self.n:
602            t = self.tok[self.i]
603            if t.id != tokSPACE and t.id != tokLN:
604                break
605            self.i += 1
606
607
608    def expectId(self, id):
609        """check that a given token id is at the current position, then skip over it"""
610        self.skip_spaces()
611        if self.i >= self.n or self.tok[self.i].id != id:
612            self.throw(BadExpectedToken,self.i,"### expecting '%s' in expression, got '%s'" % (id, self.tok[self.i].id))
613        self.i += 1
614
615
616    def expectIdent(self):
617        self.skip_spaces()
618        if self.i >= self.n or self.tok[self.i].id != tokIDENT:
619            self.throw(BadExpectedToken, self.i,"### expecting identifier in expression, got '%s'" % (id, self.tok[self.i].id))
620        self.i += 1
621
622
623    def is_decimal(self):
624        v = self.tok[self.i].value[:]
625        while len(v) > 0 and v[-1] in "ULul":
626            v = v[:-1]
627        for digit in v:
628            if not digit.isdigit():
629                return None
630
631        self.i += 1
632        return ("int", string.atoi(v))
633
634
635    def is_hexadecimal(self):
636        v = self.tok[self.i].value[:]
637        while len(v) > 0 and v[-1] in "ULul":
638            v = v[:-1]
639        if len(v) > 2 and (v[0:2] == "0x" or v[0:2] == "0X"):
640            for digit in v[2:]:
641                if not digit in "0123456789abcdefABCDEF":
642                    return None
643
644            # for a hex expression tuple, the argument
645            # is the value as an integer
646            self.i += 1
647            return ("hex", int(v[2:], 16))
648
649        return None
650
651
652    def is_integer(self):
653        if self.tok[self.i].id != tokNUMBER:
654            return None
655
656        c = self.is_decimal()
657        if c: return c
658
659        c = self.is_hexadecimal()
660        if c: return c
661
662        return None
663
664
665    def is_number(self):
666        t = self.tok[self.i]
667        if t.id == tokMINUS and self.i+1 < self.n:
668            self.i += 1
669            c = self.is_integer()
670            if c:
671                op, val  = c
672                return (op, -val)
673        if t.id == tokPLUS and self.i+1 < self.n:
674            c = self.is_integer()
675            if c: return c
676
677        return self.is_integer()
678
679
680    def is_defined(self):
681        t = self.tok[self.i]
682        if t.id != tokDEFINED:
683            return None
684
685        # we have the defined keyword, check the rest
686        self.i += 1
687        self.skip_spaces()
688        use_parens = 0
689        if self.i < self.n and self.tok[self.i].id == tokLPAREN:
690            use_parens = 1
691            self.i += 1
692            self.skip_spaces()
693
694        if self.i >= self.n:
695            self.throw(CppConstantExpected,i,"### 'defined' must be followed  by macro name or left paren")
696
697        t = self.tok[self.i]
698        if t.id != tokIDENT:
699            self.throw(CppConstantExpected,i,"### 'defined' must be followed by macro name")
700
701        self.i += 1
702        if use_parens:
703            self.expectId(tokRPAREN)
704
705        return ("defined", t.value)
706
707
    def is_call_or_ident(self):
        """try to parse a macro call 'NAME(arg0,...,argN)' or a plain
           identifier at the current position.

           returns ("call", (name, params)) where each param is the list of
           its tokens with surrounding spaces stripped, ("ident", name) for
           a bare identifier, or None (no identifier here, or unterminated
           call)."""
        self.skip_spaces()
        if self.i >= self.n:
            return None

        t = self.tok[self.i]
        if t.id != tokIDENT:
            return None

        name = t.value

        self.i += 1
        self.skip_spaces()
        if self.i >= self.n or self.tok[self.i].id != tokLPAREN:
            # no opening paren: this is a plain identifier
            return ("ident", name)

        params    = []
        depth     = 1
        self.i += 1
        j  = self.i    # j marks the start of the current parameter's tokens
        while self.i < self.n:
            id = self.tok[self.i].id
            if id == tokLPAREN:
                depth += 1
            elif depth == 1 and (id == tokCOMMA or id == tokRPAREN):
                # end of a top-level parameter: strip leading spaces ...
                while j < self.i and self.tok[j].id == tokSPACE:
                    j += 1
                k = self.i
                # ... and trailing spaces before recording it
                while k > j and self.tok[k-1].id == tokSPACE:
                    k -= 1
                param = self.tok[j:k]
                params.append(param)
                if id == tokRPAREN:
                    break
                j = self.i+1
            elif id == tokRPAREN:
                depth -= 1
            self.i += 1

        if self.i >= self.n:
            # ran out of tokens before seeing the closing paren
            return None

        self.i += 1
        return ("call", (name, params))
752
753
754    # Implements the "precedence climbing" algorithm from http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm.
755    # The "classic" algorithm would be fine if we were using a tool to generate the parser, but we're not.
756    # Dijkstra's "shunting yard" algorithm hasn't been necessary yet.
757    def parseExpression(self, minPrecedence):
758        self.skip_spaces()
759        if self.i >= self.n:
760            return None
761
762        node = self.parsePrimary()
763        while self.token() != None and self.isBinary(self.token()) and self.precedence(self.token()) >= minPrecedence:
764            op = self.token()
765            self.nextToken()
766            rhs = self.parseExpression(self.precedence(op) + 1)
767            node = (op.id, node, rhs)
768
769        return node
770
771
772    def parsePrimary(self):
773        op = self.token()
774        if self.isUnary(op):
775            self.nextToken()
776            return (op.id, self.parseExpression(self.precedence(op)))
777
778        primary = None
779        if op.id == tokLPAREN:
780            self.nextToken()
781            primary = self.parseExpression(0)
782            self.expectId(tokRPAREN)
783        elif op.id == "?":
784            self.nextToken()
785            primary = self.parseExpression(0)
786            self.expectId(":")
787        elif op.id == tokNUMBER:
788            primary = self.is_number()
789        elif op.id == tokIDENT:
790            primary = self.is_call_or_ident()
791        elif op.id == tokDEFINED:
792            primary = self.is_defined()
793        else:
794            self.throw(BadExpectedToken, "didn't expect to see a %s in factor" % (self.tok[self.i].id))
795
796        self.skip_spaces()
797
798        return primary;
799
800
801    def isBinary(self, token):
802        return token.id in self.binaries
803
804
805    def isUnary(self, token):
806        return token.id in self.unaries
807
808
809    def precedence(self, token):
810        return self.precedences.get(token.id)
811
812
813    def token(self):
814        if self.i >= self.n:
815            return None
816        return self.tok[self.i]
817
818
819    def nextToken(self):
820        self.i += 1
821        self.skip_spaces()
822        if self.i >= self.n:
823            return None
824        return self.tok[self.i]
825
826
827    def dump_node(self, e):
828        op = e[0]
829        line = "(" + op
830        if op == "int":
831            line += " %d)" % e[1]
832        elif op == "hex":
833            line += " 0x%x)" % e[1]
834        elif op == "ident":
835            line += " %s)" % e[1]
836        elif op == "defined":
837            line += " %s)" % e[1]
838        elif op == "call":
839            arg = e[1]
840            line += " %s [" % arg[0]
841            prefix = ""
842            for param in arg[1]:
843                par = ""
844                for tok in param:
845                    par += str(tok)
846                line += "%s%s" % (prefix, par)
847                prefix = ","
848            line += "])"
849        elif op in CppExpr.unaries:
850            line += " %s)" % self.dump_node(e[1])
851        elif op in CppExpr.binaries:
852            line += " %s %s)" % (self.dump_node(e[1]), self.dump_node(e[2]))
853        else:
854            line += " ?%s)" % repr(e[1])
855
856        return line
857
858    def __repr__(self):
859        return self.dump_node(self.expr)
860
861    def source_node(self, e):
862        op = e[0]
863        if op == "int":
864            return "%d" % e[1]
865        if op == "hex":
866            return "0x%x" % e[1]
867        if op == "ident":
868            # XXX: should try to expand
869            return e[1]
870        if op == "defined":
871            return "defined(%s)" % e[1]
872
873        prec = CppExpr.precedences.get(op,1000)
874        arg  = e[1]
875        if op in CppExpr.unaries:
876            arg_src = self.source_node(arg)
877            arg_op  = arg[0]
878            arg_prec = CppExpr.precedences.get(arg[0],1000)
879            if arg_prec < prec:
880                return "!(" + arg_src + ")"
881            else:
882                return "!" + arg_src
883        if op in CppExpr.binaries:
884            arg2     = e[2]
885            arg1_op  = arg[0]
886            arg2_op  = arg2[0]
887            arg1_src = self.source_node(arg)
888            arg2_src = self.source_node(arg2)
889            if CppExpr.precedences.get(arg1_op,1000) < prec:
890                arg1_src = "(%s)" % arg1_src
891            if CppExpr.precedences.get(arg2_op,1000) < prec:
892                arg2_src = "(%s)" % arg2_src
893
894            return "%s %s %s" % (arg1_src, op, arg2_src)
895        return "???"
896
897    def __str__(self):
898        return self.source_node(self.expr)
899
900    def int_node(self,e):
901        if e[0] == "int":
902            return e[1]
903        elif e[1] == "hex":
904            return int(e[1],16)
905        else:
906            return None
907
908    def toInt(self):
909        return self.int_node(self.expr)
910
911    def optimize_node(self, e, macros={}):
912        op = e[0]
913        if op == "defined":
914            op, name = e
915            if macros.has_key(name):
916                if macros[name] == kCppUndefinedMacro:
917                    return ("int", 0)
918                else:
919                    try:
920                        value = int(macros[name])
921                        return ("int", value)
922                    except:
923                        return ("defined", macros[name])
924
925            if kernel_remove_config_macros and name.startswith("CONFIG_"):
926                return ("int", 0)
927
928            return e
929
930        elif op == "ident":
931            op, name = e
932            if macros.has_key(name):
933                try:
934                    value = int(macros[name])
935                    expanded = ("int", value)
936                except:
937                    expanded = ("ident", macros[name])
938                return self.optimize_node(expanded, macros)
939            return e
940
941        elif op == "!":
942            op, v = e
943            v = self.optimize_node(v, macros)
944            if v[0] == "int":
945                if v[1] == 0:
946                    return ("int", 1)
947                else:
948                    return ("int", 0)
949            return ('!', v)
950
951        elif op == "&&":
952            op, l, r = e
953            l  = self.optimize_node(l, macros)
954            r  = self.optimize_node(r, macros)
955            li = self.int_node(l)
956            ri = self.int_node(r)
957            if li != None:
958                if li == 0:
959                    return ("int", 0)
960                else:
961                    return r
962            elif ri != None:
963                if ri == 0:
964                    return ("int", 0)
965                else:
966                    return l
967            return (op, l, r)
968
969        elif op == "||":
970            op, l, r = e
971            l  = self.optimize_node(l, macros)
972            r  = self.optimize_node(r, macros)
973            li = self.int_node(l)
974            ri = self.int_node(r)
975            if li != None:
976                if li == 0:
977                    return r
978                else:
979                    return ("int", 1)
980            elif ri != None:
981                if ri == 0:
982                    return l
983                else:
984                    return ("int", 1)
985            return (op, l, r)
986
987        else:
988            return e
989
    def optimize(self,macros={}):
        """simplify this expression in place, substituting the known
           macros in the 'macros' dict (see optimize_node)"""
        self.expr = self.optimize_node(self.expr, macros)
992
993    def is_equal_node(self,e1,e2):
994        if e1[0] != e2[0] or len(e1) != len(e2):
995            return False
996
997        op = e1[0]
998        if op == "int" or op == "hex" or op == "!" or op == "defined":
999            return e1[0] == e2[0]
1000
1001        return self.is_equal_node(e1[1],e2[1]) and self.is_equal_node(e1[2],e2[2])
1002
    def is_equal(self,other):
        """returns True iff this expression is structurally equal to the
           CppExpr 'other'"""
        return self.is_equal_node(self.expr,other.expr)
1005
1006def test_cpp_expr(expr, expected):
1007    e = CppExpr( CppLineTokenizer( expr ).toTokenList() )
1008    s1 = repr(e)
1009    if s1 != expected:
1010        print "[FAIL]: expression '%s' generates '%s', should be '%s'" % (expr, s1, expected)
1011        global failure_count
1012        failure_count += 1
1013
1014def test_cpp_expr_optim(expr, expected, macros={}):
1015    e = CppExpr( CppLineTokenizer( expr ).toTokenList() )
1016    e.optimize(macros)
1017    s1 = repr(e)
1018    if s1 != expected:
1019        print "[FAIL]: optimized expression '%s' generates '%s' with macros %s, should be '%s'" % (expr, s1, macros, expected)
1020        global failure_count
1021        failure_count += 1
1022
1023def test_cpp_expr_source(expr, expected):
1024    e = CppExpr( CppLineTokenizer( expr ).toTokenList() )
1025    s1 = str(e)
1026    if s1 != expected:
1027        print "[FAIL]: source expression '%s' generates '%s', should be '%s'" % (expr, s1, expected)
1028        global failure_count
1029        failure_count += 1
1030
def test_CppExpr():
    """unit tests for CppExpr parsing, optimization and source generation;
       failures are reported through the global 'failure_count'"""
    # --- parsing: repr() of the expression tree ---
    test_cpp_expr("0", "(int 0)")
    test_cpp_expr("1", "(int 1)")
    test_cpp_expr("(0)", "(int 0)")
    test_cpp_expr("1 && 1", "(&& (int 1) (int 1))")
    test_cpp_expr("1 && 0", "(&& (int 1) (int 0))")
    test_cpp_expr("EXAMPLE", "(ident EXAMPLE)")
    test_cpp_expr("EXAMPLE - 3", "(- (ident EXAMPLE) (int 3))")
    test_cpp_expr("defined(EXAMPLE)", "(defined EXAMPLE)")
    test_cpp_expr("defined ( EXAMPLE ) ", "(defined EXAMPLE)")
    test_cpp_expr("!defined(EXAMPLE)", "(! (defined EXAMPLE))")
    test_cpp_expr("defined(ABC) || defined(BINGO)", "(|| (defined ABC) (defined BINGO))")
    test_cpp_expr("FOO(BAR)", "(call FOO [BAR])")
    test_cpp_expr("A == 1 || defined(B)", "(|| (== (ident A) (int 1)) (defined B))")

    # --- optimization: constant folding and macro substitution ---
    test_cpp_expr_optim("0", "(int 0)")
    test_cpp_expr_optim("1", "(int 1)")
    test_cpp_expr_optim("1 && 1", "(int 1)")
    test_cpp_expr_optim("1 && 0", "(int 0)")
    test_cpp_expr_optim("0 && 1", "(int 0)")
    test_cpp_expr_optim("0 && 0", "(int 0)")
    test_cpp_expr_optim("1 || 1", "(int 1)")
    test_cpp_expr_optim("1 || 0", "(int 1)")
    test_cpp_expr_optim("0 || 1", "(int 1)")
    test_cpp_expr_optim("0 || 0", "(int 0)")
    test_cpp_expr_optim("A", "(ident A)")
    test_cpp_expr_optim("A", "(int 1)", { "A": 1 })
    test_cpp_expr_optim("A || B", "(int 1)", { "A": 1 })
    test_cpp_expr_optim("A || B", "(int 1)", { "B": 1 })
    test_cpp_expr_optim("A && B", "(ident B)", { "A": 1 })
    test_cpp_expr_optim("A && B", "(ident A)", { "B": 1 })
    test_cpp_expr_optim("A && B", "(&& (ident A) (ident B))")
    test_cpp_expr_optim("EXAMPLE", "(ident EXAMPLE)")
    test_cpp_expr_optim("EXAMPLE - 3", "(- (ident EXAMPLE) (int 3))")
    test_cpp_expr_optim("defined(EXAMPLE)", "(defined EXAMPLE)")
    test_cpp_expr_optim("defined(EXAMPLE)", "(defined XOWOE)", { "EXAMPLE": "XOWOE" })
    test_cpp_expr_optim("defined(EXAMPLE)", "(int 0)", { "EXAMPLE": kCppUndefinedMacro})
    test_cpp_expr_optim("!defined(EXAMPLE)", "(! (defined EXAMPLE))")
    test_cpp_expr_optim("!defined(EXAMPLE)", "(! (defined XOWOE))", { "EXAMPLE" : "XOWOE" })
    test_cpp_expr_optim("!defined(EXAMPLE)", "(int 1)", { "EXAMPLE" : kCppUndefinedMacro })
    test_cpp_expr_optim("defined(A) || defined(B)", "(|| (defined A) (defined B))")
    test_cpp_expr_optim("defined(A) || defined(B)", "(int 1)", { "A" : "1" })
    test_cpp_expr_optim("defined(A) || defined(B)", "(int 1)", { "B" : "1" })
    test_cpp_expr_optim("defined(A) || defined(B)", "(defined A)", { "B" : kCppUndefinedMacro })
    test_cpp_expr_optim("defined(A) || defined(B)", "(int 0)", { "A" : kCppUndefinedMacro, "B" : kCppUndefinedMacro })
    test_cpp_expr_optim("defined(A) && defined(B)", "(&& (defined A) (defined B))")
    test_cpp_expr_optim("defined(A) && defined(B)", "(defined B)", { "A" : "1" })
    test_cpp_expr_optim("defined(A) && defined(B)", "(defined A)", { "B" : "1" })
    test_cpp_expr_optim("defined(A) && defined(B)", "(int 0)", { "B" : kCppUndefinedMacro })
    test_cpp_expr_optim("defined(A) && defined(B)", "(int 0)", { "A" : kCppUndefinedMacro })
    test_cpp_expr_optim("A == 1 || defined(B)", "(|| (== (ident A) (int 1)) (defined B))" )
    test_cpp_expr_optim("defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2)", "(|| (! (defined __GLIBC__)) (< (ident __GLIBC__) (int 2)))", { "__KERNEL__": kCppUndefinedMacro })

    # --- source generation: str() round-trips back to C source ---
    test_cpp_expr_source("0", "0")
    test_cpp_expr_source("1", "1")
    test_cpp_expr_source("1 && 1", "1 && 1")
    test_cpp_expr_source("1 && 0", "1 && 0")
    test_cpp_expr_source("0 && 1", "0 && 1")
    test_cpp_expr_source("0 && 0", "0 && 0")
    test_cpp_expr_source("1 || 1", "1 || 1")
    test_cpp_expr_source("1 || 0", "1 || 0")
    test_cpp_expr_source("0 || 1", "0 || 1")
    test_cpp_expr_source("0 || 0", "0 || 0")
    test_cpp_expr_source("EXAMPLE", "EXAMPLE")
    test_cpp_expr_source("EXAMPLE - 3", "EXAMPLE - 3")
    test_cpp_expr_source("defined(EXAMPLE)", "defined(EXAMPLE)")
    test_cpp_expr_source("defined EXAMPLE", "defined(EXAMPLE)")
    test_cpp_expr_source("A == 1 || defined(B)", "A == 1 || defined(B)")
1099
1100
1101#####################################################################################
1102#####################################################################################
1103#####                                                                           #####
1104#####          C P P   B L O C K                                                #####
1105#####                                                                           #####
1106#####################################################################################
1107#####################################################################################
1108
class Block:
    """a class used to model a block of input source text. there are two block types:
        - directive blocks: contain the tokens of a single pre-processor directive (e.g. #if)
        - text blocks, contain the tokens of non-directive blocks

       the cpp parser class below will transform an input source file into a list of Block
       objects (grouped in a BlockList object for convenience)"""

    def __init__(self,tokens,directive=None,lineno=0):
        """initialize a new block, if 'directive' is None, this is a text block
           NOTE: this automatically converts '#ifdef MACRO' into '#if defined(MACRO)'
                 and '#ifndef MACRO' into '#if !defined(MACRO)'"""
        if directive == "ifdef":
            tok = Token()
            tok.set(tokDEFINED)
            tokens = [ tok ] + tokens
            directive = "if"

        elif directive == "ifndef":
            tok1 = Token()
            tok2 = Token()
            tok1.set(tokNOT)
            tok2.set(tokDEFINED)
            tokens = [ tok1, tok2 ] + tokens
            directive = "if"

        self.tokens    = tokens
        self.directive = directive
        if lineno > 0:
            self.lineno = lineno
        else:
            # default to the line of the first token
            self.lineno = self.tokens[0].lineno

        if self.isIf():
            # pre-parse the condition of #if-like directives
            self.expr = CppExpr( self.tokens )

    def isDirective(self):
        """returns True iff this is a directive block"""
        return self.directive is not None

    def isConditional(self):
        """returns True iff this is a conditional directive block"""
        return self.directive in ["if","ifdef","ifndef","else","elif","endif"]

    def isDefine(self):
        """returns the macro name in a #define directive, or None otherwise"""
        if self.directive != "define":
            return None

        return self.tokens[0].value

    def isIf(self):
        """returns True iff this is an #if-like directive block"""
        return self.directive in ["if","ifdef","ifndef","elif"]

    def isInclude(self):
        """checks whether this is a #include directive. if true, then returns the
           corresponding file name (with brackets or double-quotes). None otherwise"""
        if self.directive != "include":
            return None

        if self.tokens[0].id == tokSTRING:
            # a double-quote include, that's easy
            return self.tokens[0].value

        # we only want the bracket part, not any comments or junk after it
        if self.tokens[0].id == "<":
            i   = 0
            tok = self.tokens
            n   = len(tok)
            while i < n and tok[i].id != ">":
                i += 1

            if i >= n:
                # unterminated bracket include
                return None

            return "".join([ str(x) for x in tok[:i+1] ])

        else:
            return None

    def removeWhiteSpace(self):
        # Remove trailing whitespace and empty lines.
        # All whitespace is also contracted to a single space.
        # Only applies to text blocks; directive blocks are left untouched.
        if self.directive is not None:
            return

        tokens = []
        line   = 0     # index of line start
        space  = -1    # index of first space, or -1
        ii = 0
        nn = len(self.tokens)
        while ii < nn:
            tok = self.tokens[ii]

            # If we find a space, record its position if this is the first
            # one since the line start or the previous token. Don't append
            # anything to the tokens array yet though.
            if tok.id == tokSPACE:
                if space < 0:
                    space = ii
                ii += 1
                continue

            # If this is a line end, ignore the spaces we found previously
            # on the line, and remove empty lines.
            if tok.id == tokLN:
                old_line  = line
                old_space = space
                ii   += 1
                line  = ii
                space = -1
                if old_space == old_line:  # line only contains spaces
                    continue
                if ii-1 == old_line:  # line is empty
                    continue
                tokens.append(tok)
                continue

            # Other token, append any pending space range, converting each
            # space token to a single space character, then append the token.
            if space >= 0:
                jj = space
                space = -1
                while jj < ii:
                    tok2 = self.tokens[jj]
                    tok2.value = " "
                    tokens.append(tok2)
                    jj += 1

            tokens.append(tok)
            ii += 1

        self.tokens = tokens

    def writeWithWarning(self,out,warning,left_count,repeat_count):
        """write this block to 'out', emitting the 'warning' text every
           'repeat_count' output lines. 'left_count' is the number of lines
           left before the next warning; returns the updated count."""
        # removeWhiteSpace() will sometimes create non-directive blocks
        # without any tokens. These come from blocks that only contained
        # empty lines and spaces. They should not be printed in the final
        # output, and then should not be counted for this operation.
        #
        if not self.directive and not self.tokens:
            return left_count

        if self.directive:
            out.write(str(self).rstrip() + "\n")
            left_count -= 1
            if left_count == 0:
                out.write(warning)
                left_count = repeat_count

        else:
            for tok in self.tokens:
                out.write(str(tok))
                if tok.id == tokLN:
                    left_count -= 1
                    if left_count == 0:
                        out.write(warning)
                        left_count = repeat_count

        return left_count


    def __repr__(self):
        """generate the representation of a given block"""
        if self.directive:
            result = "#%s " % self.directive
            if self.isIf():
                result += repr(self.expr)
            else:
                for tok in self.tokens:
                    result += repr(tok)
        else:
            result = ""
            for tok in self.tokens:
                result += repr(tok)

        return result

    def __str__(self):
        """generate the string representation of a given block"""
        if self.directive:
            if self.directive == "if":
                # small optimization to re-generate #ifdef and #ifndef
                e = self.expr.expr
                op = e[0]
                if op == "defined":
                    result = "#ifdef %s" % e[1]
                elif op == "!" and e[1][0] == "defined":
                    result = "#ifndef %s" % e[1][1]
                else:
                    result = "#if " + str(self.expr)
            else:
                result = "#%s" % self.directive
                if len(self.tokens):
                    result += " "
                for tok in self.tokens:
                    result += str(tok)
        else:
            result = ""
            for tok in self.tokens:
                result += str(tok)

        return result
1313
class BlockList:
    """a convenience class used to hold and process a list of blocks returned by
       the cpp parser"""
    def __init__(self,blocks):
        # 'blocks' is a list of Block objects, in source order
        self.blocks = blocks

    def __len__(self):
        return len(self.blocks)

    def __getitem__(self,n):
        return self.blocks[n]

    def __repr__(self):
        return repr(self.blocks)

    def __str__(self):
        """regenerate the source text; each directive ends its line,
           with any trailing whitespace stripped"""
        result = ""
        for b in self.blocks:
            result += str(b)
            if b.isDirective():
                result = result.rstrip() + '\n'
        return result

    def  optimizeIf01(self):
        """remove the code between #if 0 .. #endif in a BlockList"""
        self.blocks = optimize_if01(self.blocks)

    def optimizeMacros(self, macros):
        """remove known defined and undefined macros from a BlockList"""
        for b in self.blocks:
            if b.isIf():
                b.expr.optimize(macros)

    def removeMacroDefines(self,macros):
        """remove known macro definitions from a BlockList"""
        self.blocks = remove_macro_defines(self.blocks,macros)

    def removeWhiteSpace(self):
        """contract whitespace and drop empty lines in all text blocks"""
        for b in self.blocks:
            b.removeWhiteSpace()

    def optimizeAll(self,macros):
        """apply both the macro and the #if 0 optimizations"""
        self.optimizeMacros(macros)
        self.optimizeIf01()
        return

    def findIncludes(self):
        """return the list of included files in a BlockList"""
        result = []
        for b in self.blocks:
            i = b.isInclude()
            if i:
                result.append(i)

        return result


    def write(self,out):
        """write the regenerated source text of all blocks to 'out'"""
        out.write(str(self))

    def writeWithWarning(self,out,warning,repeat_count):
        """write all blocks to 'out', re-emitting the 'warning' text
           every 'repeat_count' output lines"""
        left_count = repeat_count
        for b in self.blocks:
            left_count = b.writeWithWarning(out,warning,left_count,repeat_count)

    def removeComments(self):
        # collapse every whitespace token's value to a single space;
        # NOTE(review): comments appear to be carried as tokSPACE tokens
        # by the tokenizer, hence the method name -- confirm upstream
        for b in self.blocks:
            for tok in b.tokens:
                if tok.id == tokSPACE:
                    tok.value = " "

    def removeVarsAndFuncs(self,knownStatics=set()):
        """remove all extern and static declarations corresponding
           to variable and function declarations. we only accept typedefs
           and enum/structs/union declarations.

           however, we keep the definitions corresponding to the set
           of known static inline functions in the set 'knownStatics',
           which is useful for optimized byteorder swap functions and
           stuff like that.
           """
        # This is a small state machine over the token stream:
        # state = 0 => normal (i.e. LN + spaces)
        # state = 1 => typedef/struct encountered, ends with ";"
        # state = 2 => var declaration encountered, ends with ";"
        # state = 3 => func declaration encountered, ends with "}"
        state      = 0
        depth      = 0     # current brace nesting depth
        blocks2    = []    # output list of blocks
        skipTokens = False # True while inside a declaration being removed
        for b in self.blocks:
            if b.isDirective():
                blocks2.append(b)
            else:
                n     = len(b.tokens)
                i     = 0
                # 'first' is the index of the first token to keep
                if skipTokens:
                    first = n
                else:
                    first = 0
                while i < n:
                    tok = b.tokens[i]
                    tokid = tok.id
                    # If we are not looking for the start of a new
                    # type/var/func, then skip over tokens until
                    # we find our terminator, managing the depth of
                    # accolades as we go.
                    if state > 0:
                        terminator = False
                        if tokid == '{':
                            depth += 1
                        elif tokid == '}':
                            if depth > 0:
                                depth -= 1
                            if (depth == 0) and (state == 3):
                                terminator = True
                        elif tokid == ';' and depth == 0:
                            terminator = True

                        if terminator:
                            # we found the terminator
                            state = 0
                            if skipTokens:
                                skipTokens = False
                                first = i+1

                        i = i+1
                        continue

                    # We are looking for the start of a new type/func/var
                    # ignore whitespace
                    if tokid in [tokLN, tokSPACE]:
                        i = i+1
                        continue

                    # Is it a new type definition, then start recording it
                    if tok.value in [ 'struct', 'typedef', 'enum', 'union', '__extension__' ]:
                        state = 1
                        i     = i+1
                        continue

                    # Is it a variable or function definition. If so, first
                    # try to determine which type it is, and also extract
                    # its name.
                    #
                    # We're going to parse the next tokens of the same block
                    # until we find a semi-column or a left parenthesis.
                    #
                    # The semi-column corresponds to a variable definition,
                    # the left-parenthesis to a function definition.
                    #
                    # We also assume that the var/func name is the last
                    # identifier before the terminator.
                    #
                    j = i+1
                    ident = ""
                    while j < n:
                        tokid = b.tokens[j].id
                        if tokid == '(':  # a function declaration
                            state = 3
                            break
                        elif tokid == ';': # a variable declaration
                            state = 2
                            break
                        if tokid == tokIDENT:
                            ident = b.tokens[j].value
                        j += 1

                    if j >= n:
                        # This can only happen when the declaration
                        # does not end on the current block (e.g. with
                        # a directive mixed inside it.
                        #
                        # We will treat it as malformed because
                        # it's very hard to recover from this case
                        # without making our parser much more
                        # complex.
                        #
                        #print "### skip unterminated static '%s'" % ident
                        break

                    if ident in knownStatics:
                        #print "### keep var/func '%s': %s" % (ident,repr(b.tokens[i:j]))
                        pass
                    else:
                        # We're going to skip the tokens for this declaration
                        #print "### skip variable /func'%s': %s" % (ident,repr(b.tokens[i:j]))
                        if i > first:
                            blocks2.append( Block(b.tokens[first:i]))
                        skipTokens = True
                        first      = n

                    i = i+1

                if i > first:
                    # flush the kept tokens at the end of the block
                    #print "### final '%s'" % repr(b.tokens[first:i])
                    blocks2.append( Block(b.tokens[first:i]) )

        self.blocks = blocks2

    def insertDisclaimer(self,disclaimer="/* auto-generated file, DO NOT EDIT */"):
        """insert your standard issue disclaimer that this is an
           auto-generated file, etc.."""
        tokens = CppLineTokenizer( disclaimer ).toTokenList()
        tokens = tokens[:-1]  # remove trailing tokLN
        self.blocks = [ Block(tokens) ] + self.blocks

    def replaceTokens(self,replacements=dict()):
        """replace tokens according to the given dict
           """
        # only rewrites identifiers in text blocks; directives are untouched
        for b in self.blocks:
            if not b.isDirective():
                for tok in b.tokens:
                    if tok.id == tokIDENT:
                        if tok.value in replacements:
                            tok.value = replacements[tok.value]
1529
class BlockParser:
    """a class used to convert an input source file into a BlockList object"""

    def __init__(self,tokzer=None):
        """initialize a block parser. the input source is provided through a Tokenizer
           object"""
        self.reset(tokzer)

    def reset(self,tokzer):
        """reset the parser state and attach a new tokenizer"""
        self.state  = 1
        self.tokzer = tokzer

    def getBlocks(self,tokzer=None):
        """tokenize and parse the input source, return a BlockList object
           NOTE: empty and line-numbering directives are ignored and removed
                 from the result. as a consequence, it is possible to have
                 two successive text blocks in the result"""
        # state 0 => in source code
        # state 1 => in source code, after a LN
        # state 2 => in source code, after LN then some space
        state   = 1
        lastLN  = 0      # index just past the last LN token in 'current'
        current = []     # tokens of the text block being accumulated
        blocks  = []

        if tokzer is None:
            tokzer = self.tokzer

        while 1:
            tok = tokzer.getToken()
            if tok.id == tokEOF:
                break

            if tok.id == tokLN:
                state    = 1
                current.append(tok)
                lastLN   = len(current)

            elif tok.id == tokSPACE:
                if state == 1:
                    state = 2
                current.append(tok)

            elif tok.id == "#":
                # NOTE(review): a '#' in the middle of a line (state == 0)
                # is silently dropped here rather than appended to
                # 'current' -- confirm whether that is intentional
                if state > 0:
                    # this is the start of a directive

                    if lastLN > 0:
                        # record previous tokens as text block
                        block   = Block(current[:lastLN])
                        blocks.append(block)
                        lastLN  = 0

                    current = []

                    # skip spaces after the #
                    while 1:
                        tok = tokzer.getToken()
                        if tok.id != tokSPACE:
                            break

                    if tok.id != tokIDENT:
                        # empty or line-numbering directive: skip to end of line
                        if tok.id != tokLN and tok.id != tokEOF:
                            while 1:
                                tok = tokzer.getToken()
                                if tok.id == tokLN or tok.id == tokEOF:
                                    break
                        continue

                    directive = tok.value
                    lineno    = tok.lineno

                    # skip spaces
                    tok = tokzer.getToken()
                    while tok.id == tokSPACE:
                        tok = tokzer.getToken()

                    # then record tokens until LN
                    dirtokens = []
                    while tok.id != tokLN and tok.id != tokEOF:
                        dirtokens.append(tok)
                        tok = tokzer.getToken()

                    block = Block(dirtokens,directive,lineno)
                    blocks.append(block)
                    state   = 1

            else:
                state = 0
                current.append(tok)

        # flush any trailing text block
        if len(current) > 0:
            block = Block(current)
            blocks.append(block)

        return BlockList(blocks)

    def parse(self,tokzer):
        """parse the output of an arbitrary tokenizer into a BlockList"""
        return self.getBlocks( tokzer )

    def parseLines(self,lines):
        """parse a list of text lines into a BlockList object"""
        return self.getBlocks( CppLinesTokenizer(lines) )

    def parseFile(self,path):
        """parse a file into a BlockList object"""
        # use a context manager so the file is closed even on error,
        # and avoid shadowing the 'file' builtin
        with open(path, "rt") as f:
            return self.getBlocks( CppFileTokenizer(f) )
1641
1642
def test_block_parsing(lines,expected):
    """parse the text 'lines' and verify that the resulting blocks
       stringify to the 'expected' list.

       raises BadExpectedToken on any mismatch."""
    blocks = BlockParser().parse( CppLinesTokenizer(lines) )
    if len(blocks) != len(expected):
        # call-form raise: identical in Python 2, also valid in Python 3
        raise BadExpectedToken("parser.buildBlocks returned '%s' expecting '%s'"
              % (str(blocks), repr(expected)))
    for n in range(len(blocks)):
        if str(blocks[n]) != expected[n]:
            raise BadExpectedToken("parser.buildBlocks()[%d] is '%s', expecting '%s'"
                  % (n, str(blocks[n]), expected[n]))
1654
def test_BlockParser():
    """sanity checks for BlockParser: plain text, empty directives,
       and #ifdef regeneration"""
    test_block_parsing(["#error hello"], ["#error hello"])
    test_block_parsing(["foo", "", "bar"], ["foo\n\nbar\n"])
    test_block_parsing(["foo", "  #  ", "bar"], ["foo\n", "bar\n"])
    test_block_parsing(
        ["foo", "   #  ", "  #  /* ahah */ if defined(__KERNEL__) ", "bar", "#endif"],
        ["foo\n", "#ifdef __KERNEL__", "bar\n", "#endif"])
1662
1663
1664#####################################################################################
1665#####################################################################################
1666#####                                                                           #####
1667#####        B L O C K   L I S T   O P T I M I Z A T I O N                      #####
1668#####                                                                           #####
1669#####################################################################################
1670#####################################################################################
1671
def remove_macro_defines(blocks, excludedMacros=set()):
    """return 'blocks' minus every '#define <name>' block whose macro
       name appears in 'excludedMacros'; all other blocks are kept."""
    result = []
    for b in blocks:
        macroName = b.isDefine()
        if macroName is None or macroName not in excludedMacros:
            result.append(b)

    return result
1681
def find_matching_endif(blocks, i):
    """scan 'blocks' starting at index 'i' for the directive that closes
       the enclosing conditional: the matching #endif, or an #else/#elif
       at the same nesting level. returns len(blocks) if none is found."""
    n = len(blocks)
    depth = 1
    while i < n:
        b = blocks[i]
        if b.isDirective():
            d = b.directive
            if d in ("if", "ifndef", "ifdef"):
                depth += 1
            elif d in ("else", "elif") and depth == 1:
                return i
            elif d == "endif":
                depth -= 1
                if depth == 0:
                    return i
        i += 1
    return i
1698
def  optimize_if01( blocks ):
    """remove the code between #if 0 .. #endif in a list of CppBlocks

       '#if 0' regions are dropped entirely, while '#if 1' regions are
       replaced by their (recursively optimized) contents.  Returns a new
       list of blocks; note that directive blocks from the input list may
       be rewritten in place (e.g. an '#else' becomes '#if 1' or '#if 0'
       so the next iteration can process it as a constant conditional).
    """
    i = 0
    n = len(blocks)
    result = []
    while i < n:
        # copy the run of blocks up to the next conditional directive
        j = i
        while j < n and not blocks[j].isIf():
            j += 1
        if j > i:
            D2("appending lines %d to %d" % (blocks[i].lineno, blocks[j-1].lineno))
            result += blocks[i:j]
        if j >= n:
            break
        # evaluate the conditional's expression; toInt() returns None
        # when it does not reduce to a compile-time constant
        expr = blocks[j].expr
        r    = expr.toInt()
        if r == None:
            # not a constant: keep the directive unchanged and move on
            result.append(blocks[j])
            i = j + 1
            continue

        if r == 0:
            # if 0 => skip everything until the corresponding #endif
            j = find_matching_endif( blocks, j+1 )
            if j >= n:
                # unterminated #if 0, finish here
                break
            dir = blocks[j].directive
            if dir == "endif":
                # the whole region (directives included) is dropped
                D2("remove 'if 0' .. 'endif' (lines %d to %d)" % (blocks[i].lineno, blocks[j].lineno))
                i = j + 1
            elif dir == "else":
                # convert 'else' into 'if 1'
                D2("convert 'if 0' .. 'else' into 'if 1' (lines %d to %d)" % (blocks[i].lineno, blocks[j-1].lineno))
                blocks[j].directive = "if"
                blocks[j].expr      = CppExpr( CppLineTokenizer("1").toTokenList() )
                i = j
            elif dir == "elif":
                # convert 'elif' into 'if'
                D2("convert 'if 0' .. 'elif' into 'if'")
                blocks[j].directive = "if"
                i = j
            continue

        # if 1 => find corresponding endif and remove/transform them
        k = find_matching_endif( blocks, j+1 )
        if k >= n:
            # unterminated #if 1, finish here
            D2("unterminated 'if 1'")
            result += blocks[j+1:k]
            break

        dir = blocks[k].directive
        if dir == "endif":
            # keep only the body, recursively optimized
            D2("convert 'if 1' .. 'endif' (lines %d to %d)"  % (blocks[j].lineno, blocks[k].lineno))
            result += optimize_if01(blocks[j+1:k])
            i       = k+1
        elif dir == "else":
            # convert 'else' into 'if 0'
            D2("convert 'if 1' .. 'else' (lines %d to %d)"  % (blocks[j].lineno, blocks[k].lineno))
            result += optimize_if01(blocks[j+1:k])
            blocks[k].directive = "if"
            blocks[k].expr      = CppExpr( CppLineTokenizer("0").toTokenList() )
            i = k
        elif dir == "elif":
            # convert 'elif' into 'if 0'
            D2("convert 'if 1' .. 'elif' (lines %d to %d)" % (blocks[j].lineno, blocks[k].lineno))
            # NOTE(review): unlike the 'else' branch above, the directive is
            # left as 'elif' and only the expression is forced to 0 — this
            # presumably relies on isIf() also matching 'elif' so the next
            # iteration treats it as a constant-0 conditional; confirm.
            blocks[k].expr      = CppExpr( CppLineTokenizer("0").toTokenList() )
            i = k
    return result
1770
1771def  test_optimizeAll():
1772    text = """\
1773#if 1
1774#define  GOOD_1
1775#endif
1776#if 0
1777#define  BAD_2
1778#define  BAD_3
1779#endif
1780
1781#if 1
1782#define  GOOD_2
1783#else
1784#define  BAD_4
1785#endif
1786
1787#if 0
1788#define  BAD_5
1789#else
1790#define  GOOD_3
1791#endif
1792
1793#if defined(__KERNEL__)
1794#define BAD_KERNEL
1795#endif
1796
1797#if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2)
1798#define X
1799#endif
1800
1801#if 0
1802#if 1
1803#define  BAD_6
1804#endif
1805#endif\
1806"""
1807
1808    expected = """\
1809#define GOOD_1
1810
1811#define GOOD_2
1812
1813#define GOOD_3
1814
1815
1816#if !defined(__GLIBC__) || __GLIBC__ < 2
1817#define X
1818#endif
1819
1820"""
1821
1822    out = StringOutput()
1823    lines = string.split(text, '\n')
1824    list = BlockParser().parse( CppLinesTokenizer(lines) )
1825    #D_setlevel(2)
1826    list.optimizeAll( {"__KERNEL__":kCppUndefinedMacro} )
1827    list.write(out)
1828    if out.get() != expected:
1829        print "[FAIL]: macro optimization failed\n"
1830        print "<<<< expecting '",
1831        print expected,
1832        print "'\n>>>> result '"
1833        print out.get(),
1834        print "'\n----"
1835        global failure_count
1836        failure_count += 1
1837
1838
1839# -- Always run the unit tests.
1840
def runUnitTests():
    """run all unit tests for this program"""
    # order matters only for readability of failure output
    for test in (test_CppTokenizer,
                 test_CppExpr,
                 test_optimizeAll,
                 test_BlockParser):
        test()
1847
# Module-level counter, incremented by each failing unit test.
failure_count = 0
# The tests run unconditionally on import/execution of this module.
runUnitTests()
# A non-zero exit status signals test failure to the caller/build.
if failure_count != 0:
    sys.exit(1)
1852