cpp.py revision c95eb57405d3d2f0e6cfab313aa74b1bad280452
1# a glorified C pre-processor parser
2
3import sys, re, string
4from utils import *
5from defaults import *
6
7debugTokens             = False
8debugDirectiveTokenizer = False
9debugLineParsing        = False
10debugCppExpr            = False
11debugOptimIf01          = False
12
13#####################################################################################
14#####################################################################################
15#####                                                                           #####
16#####           C P P   T O K E N S                                             #####
17#####                                                                           #####
18#####################################################################################
19#####################################################################################
20
21# the list of supported C-preprocessor tokens
22# plus a couple of C tokens as well
23tokEOF       = "\0"
24tokLN        = "\n"
25tokSTRINGIFY = "#"
26tokCONCAT    = "##"
27tokLOGICAND  = "&&"
28tokLOGICOR   = "||"
29tokSHL       = "<<"
30tokSHR       = ">>"
31tokEQUAL     = "=="
32tokNEQUAL    = "!="
33tokLT        = "<"
34tokLTE       = "<="
35tokGT        = ">"
36tokGTE       = ">="
37tokELLIPSIS  = "..."
38tokSPACE     = " "
39tokDEFINED   = "defined"
40tokLPAREN    = "("
41tokRPAREN    = ")"
42tokNOT       = "!"
43tokPLUS      = "+"
44tokMINUS     = "-"
45tokMULTIPLY  = "*"
46tokDIVIDE    = "/"
47tokMODULUS   = "%"
48tokBINAND    = "&"
49tokBINOR     = "|"
50tokBINXOR    = "^"
51tokCOMMA     = ","
52tokLBRACE    = "{"
53tokRBRACE    = "}"
54tokARROW     = "->"
55tokINCREMENT = "++"
56tokDECREMENT = "--"
57tokNUMBER    = "<number>"
58tokIDENT     = "<ident>"
59tokSTRING    = "<string>"
60
61class Token:
62    """a simple class to hold information about a given token.
63       each token has a position in the source code, as well as
64       an 'id' and a 'value'. the id is a string that identifies
65       the token's class, while the value is the string of the
66       original token itself.
67
68       for example, the tokenizer concatenates a series of spaces
69       and tabs as a single tokSPACE id, whose value if the original
70       spaces+tabs sequence."""
71
72    def __init__(self):
73        self.id     = None
74        self.value  = None
75        self.lineno = 0
76        self.colno  = 0
77
78    def set(self,id,val=None):
79        self.id = id
80        if val:
81            self.value = val
82        else:
83            self.value = id
84        return None
85
86    def copyFrom(self,src):
87        self.id     = src.id
88        self.value  = src.value
89        self.lineno = src.lineno
90        self.colno  = src.colno
91
92    def __repr__(self):
93        if self.id == tokIDENT:
94            return "(ident %s)" % self.value
95        if self.id == tokNUMBER:
96            return "(number %s)" % self.value
97        if self.id == tokSTRING:
98            return "(string '%s')" % self.value
99        if self.id == tokLN:
100            return "<LN>"
101        if self.id == tokEOF:
102            return "<EOF>"
103        if self.id == tokSPACE and self.value == "\\":
104            # this corresponds to a trailing \ that was transformed into a tokSPACE
105            return "<\\>"
106
107        return self.id
108
109    def __str__(self):
110        if self.id == tokIDENT:
111            return self.value
112        if self.id == tokNUMBER:
113            return self.value
114        if self.id == tokSTRING:
115            return self.value
116        if self.id == tokEOF:
117            return "<EOF>"
118        if self.id == tokSPACE:
119            if self.value == "\\":  # trailing \
120                return "\\\n"
121            else:
122                return self.value
123
124        return self.id
125
126class BadExpectedToken(Exception):
127    def __init__(self,msg):
128        print msg
129
130#####################################################################################
131#####################################################################################
132#####                                                                           #####
133#####          C P P   T O K E N   C U R S O R                                  #####
134#####                                                                           #####
135#####################################################################################
136#####################################################################################
137
138class TokenCursor:
139    """a small class to iterate over a list of Token objects"""
140    def __init__(self,tokens):
141        self.tokens = tokens
142        self.n      = 0
143        self.count  = len(tokens)
144
145    def set(self,n):
146        """set the current position"""
147        if n < 0:
148            n = 0
149        if n > self.count:
150            n = self.count
151        self.n = n
152
153    def peekId(self):
154        """retrieve the id of the current token"""
155        if (self.n >= self.count):
156            return None
157        return self.tokens[self.n].id
158
159    def peek(self):
160        """retrieve the current token. does not change position"""
161        if (self.n >= self.count):
162            return None
163        return self.tokens[self.n]
164
165    def skip(self):
166        """increase current token position"""
167        if (self.n < self.count):
168            self.n += 1
169
170    def skipSpaces(self):
171        """skip over all space tokens, this includes tokSPACE and tokLN"""
172        while 1:
173            tok = self.peekId()
174            if tok != tokSPACE and tok != tokLN:
175                break
176            self.skip()
177
178    def skipIfId(self,id):
179        """skip an optional token"""
180        if self.peekId() == id:
181            self.skip()
182
183    def expectId(self,id):
184        """raise an exception if the current token hasn't a given id.
185           otherwise skip over it"""
186        tok = self.peek()
187        if tok.id != id:
188            raise BadExpectedToken, "%d:%d: '%s' expected, received '%s'" % (tok.lineno, tok.colno, id, tok.id)
189        self.skip()
190
191    def remain(self):
192        """return the list of remaining tokens"""
193        return self.tokens[self.n:]
194
195
196#####################################################################################
197#####################################################################################
198#####                                                                           #####
199#####           C P P   T O K E N I Z E R                                       #####
200#####                                                                           #####
201#####################################################################################
202#####################################################################################
203
204# list of long symbols, i.e. those that take more than one characters
205cppLongSymbols = [ tokCONCAT, tokLOGICAND, tokLOGICOR, tokSHL, tokSHR, tokELLIPSIS, tokEQUAL,\
206                   tokNEQUAL, tokLTE, tokGTE, tokARROW, tokINCREMENT, tokDECREMENT ]
207
208class CppTokenizer:
209    """an abstract class used to convert some input text into a list
210       of tokens. real implementations follow and differ in the format
211       of the input text only"""
212
213    def __init__(self):
214        """initialize a new CppTokenizer object"""
215        self.eof  = False  # end of file reached ?
216        self.text = None   # content of current line, with final \n stripped
217        self.line = 0      # number of current line
218        self.pos  = 0      # current character position in current line
219        self.len  = 0      # length of current line text
220        self.held = Token()
221
222    def setLineText(self,line):
223        """set the content of the (next) current line. should be called
224           by fillLineText() in derived classes"""
225        self.text = line
226        self.len  = len(line)
227        self.pos  = 0
228
229    def fillLineText(self):
230        """refresh the content of 'line' with a new line of input"""
231        # to be overriden
232        self.eof = True
233
234    def markPos(self,tok):
235        """mark the position of the current token in the source file"""
236        if self.eof or self.pos > self.len:
237            tok.lineno = self.line + 1
238            tok.colno  = 0
239        else:
240            tok.lineno = self.line
241            tok.colno  = self.pos
242
243    def peekChar(self):
244        """return the current token under the cursor without moving it"""
245        if self.eof:
246            return tokEOF
247
248        if self.pos > self.len:
249            self.pos   = 0
250            self.line += 1
251            self.fillLineText()
252            if self.eof:
253                return tokEOF
254
255        if self.pos == self.len:
256            return tokLN
257        else:
258            return self.text[self.pos]
259
260    def peekNChar(self,n):
261        """try to peek the next n chars on the same line"""
262        if self.pos + n > self.len:
263            return None
264        return self.text[self.pos:self.pos+n]
265
266    def skipChar(self):
267        """increment the token cursor position"""
268        if not self.eof:
269            self.pos += 1
270
271    def skipNChars(self,n):
272        if self.pos + n <= self.len:
273            self.pos += n
274        else:
275            while n > 0:
276                self.skipChar()
277                n -= 1
278
279    def nextChar(self):
280        """retrieve the token at the current cursor position, then skip it"""
281        result = self.peekChar()
282        self.skipChar()
283        return  result
284
285    def getEscape(self):
286        # try to get all characters after a backslash (\)
287        result = self.nextChar()
288        if result == "0":
289            # octal number ?
290            num = self.peekNChar(3)
291            if num != None:
292                isOctal = True
293                for d in num:
294                    if not d in "01234567":
295                        isOctal = False
296                        break
297                if isOctal:
298                    result += num
299                    self.skipNChars(3)
300        elif result == "x" or result == "X":
301            # hex number ?
302            num = self.peekNChar(2)
303            if num != None:
304                isHex = True
305                for d in num:
306                    if not d in "012345678abcdefABCDEF":
307                        isHex = False
308                        break
309                if isHex:
310                    result += num
311                    self.skipNChars(2)
312        elif result == "u" or result == "U":
313            # unicode char ?
314            num = self.peekNChar(4)
315            if num != None:
316                isHex = True
317                for d in num:
318                    if not d in "012345678abcdefABCDEF":
319                        isHex = False
320                        break
321                if isHex:
322                    result += num
323                    self.skipNChars(4)
324
325        return result
326
327    def nextRealToken(self,tok):
328        """return next CPP token, used internally by nextToken()"""
329        c = self.nextChar()
330        if c == tokEOF or c == tokLN:
331            return tok.set(c)
332
333        if c == '/':
334            c = self.peekChar()
335            if c == '/':   # C++ comment line
336                self.skipChar()
337                while 1:
338                    c = self.nextChar()
339                    if c == tokEOF or c == tokLN:
340                        break
341                return tok.set(tokLN)
342            if c == '*':   # C comment start
343                self.skipChar()
344                value = "/*"
345                prev_c = None
346                while 1:
347                    c = self.nextChar()
348                    if c == tokEOF:
349                        #print "## EOF after '%s'" % value
350                        return tok.set(tokEOF,value)
351                    if c == '/' and prev_c == '*':
352                        break
353                    prev_c = c
354                    value += c
355
356                value += "/"
357                #print "## COMMENT: '%s'" % value
358                return tok.set(tokSPACE,value)
359            c = '/'
360
361        if c.isspace():
362            while 1:
363                c2 = self.peekChar()
364                if c2 == tokLN or not c2.isspace():
365                    break
366                c += c2
367                self.skipChar()
368            return tok.set(tokSPACE,c)
369
370        if c == '\\':
371            if debugTokens:
372                print "nextRealToken: \\ found, next token is '%s'" % repr(self.peekChar())
373            if self.peekChar() == tokLN:   # trailing \
374                # eat the tokLN
375                self.skipChar()
376                # we replace a trailing \ by a tokSPACE whose value is
377                # simply "\\". this allows us to detect them later when
378                # needed.
379                return tok.set(tokSPACE,"\\")
380            else:
381                # treat as a single token here ?
382                c +=self.getEscape()
383                return tok.set(c)
384
385        if c == "'":  # chars
386            c2 = self.nextChar()
387            c += c2
388            if c2 == '\\':
389                c += self.getEscape()
390
391            while 1:
392                c2 = self.nextChar()
393                if c2 == tokEOF:
394                    break
395                c += c2
396                if c2 == "'":
397                    break
398
399            return tok.set(tokSTRING, c)
400
401        if c == '"':  # strings
402            quote = 0
403            while 1:
404                c2  = self.nextChar()
405                if c2 == tokEOF:
406                    return tok.set(tokSTRING,c)
407
408                c += c2
409                if not quote:
410                    if c2 == '"':
411                        return tok.set(tokSTRING,c)
412                    if c2 == "\\":
413                        quote = 1
414                else:
415                    quote = 0
416
417        if c >= "0" and c <= "9":  # integers ?
418            while 1:
419                c2 = self.peekChar()
420                if c2 == tokLN or (not c2.isalnum() and c2 != "_"):
421                    break
422                c += c2
423                self.skipChar()
424            return tok.set(tokNUMBER,c)
425
426        if c.isalnum() or c == "_":  # identifiers ?
427            while 1:
428                c2 = self.peekChar()
429                if c2 == tokLN or (not c2.isalnum() and c2 != "_"):
430                    break
431                c += c2
432                self.skipChar()
433            if c == tokDEFINED:
434                return tok.set(tokDEFINED)
435            else:
436                return tok.set(tokIDENT,c)
437
438        # check special symbols
439        for sk in cppLongSymbols:
440            if c == sk[0]:
441                sklen = len(sk[1:])
442                if self.pos + sklen <= self.len and \
443                   self.text[self.pos:self.pos+sklen] == sk[1:]:
444                    self.pos += sklen
445                    return tok.set(sk)
446
447        return tok.set(c)
448
449    def nextToken(self,tok):
450        """return the next token from the input text. this function
451           really updates 'tok', and does not return a new one"""
452        self.markPos(tok)
453        self.nextRealToken(tok)
454
455    def getToken(self):
456        tok = Token()
457        self.nextToken(tok)
458        if debugTokens:
459            print "getTokens: %s" % repr(tok)
460        return tok
461
462    def toTokenList(self):
463        """convert the input text of a CppTokenizer into a direct
464           list of token objects. tokEOF is stripped from the result"""
465        result = []
466        while 1:
467            tok = Token()
468            self.nextToken(tok)
469            if tok.id == tokEOF:
470                break
471            result.append(tok)
472        return result
473
474class CppLineTokenizer(CppTokenizer):
475    """a CppTokenizer derived class that accepts a single line of text as input"""
476    def __init__(self,line,lineno=1):
477        CppTokenizer.__init__(self)
478        self.line = lineno
479        self.setLineText(line)
480
481
482class CppLinesTokenizer(CppTokenizer):
483    """a CppTokenizer derived class that accepts a list of texdt lines as input.
484       the lines must not have a trailing \n"""
485    def __init__(self,lines=[],lineno=1):
486        """initialize a CppLinesTokenizer. you can later add lines using addLines()"""
487        CppTokenizer.__init__(self)
488        self.line  = lineno
489        self.lines = lines
490        self.index = 0
491        self.count = len(lines)
492
493        if self.count > 0:
494            self.fillLineText()
495        else:
496            self.eof = True
497
498    def addLine(self,line):
499        """add a line to a CppLinesTokenizer. this can be done after tokenization
500           happens"""
501        if self.count == 0:
502            self.setLineText(line)
503            self.index = 1
504        self.lines.append(line)
505        self.count += 1
506        self.eof    = False
507
508    def fillLineText(self):
509        if self.index < self.count:
510            self.setLineText(self.lines[self.index])
511            self.index += 1
512        else:
513            self.eof = True
514
515
516class CppFileTokenizer(CppTokenizer):
517    def __init__(self,file,lineno=1):
518        CppTokenizer.__init__(self)
519        self.file = file
520        self.line = lineno
521
522    def fillLineText(self):
523        line = self.file.readline()
524        if len(line) > 0:
525            if line[-1] == '\n':
526                line = line[:-1]
527            if len(line) > 0 and line[-1] == "\r":
528                line = line[:-1]
529            self.setLineText(line)
530        else:
531            self.eof = True
532
533# Unit testing
534#
535class CppTokenizerTester:
536    """a class used to test CppTokenizer classes"""
537    def __init__(self,tokenizer=None):
538        self.tokenizer = tokenizer
539        self.token     = Token()
540
541    def setTokenizer(self,tokenizer):
542        self.tokenizer = tokenizer
543
544    def expect(self,id):
545        self.tokenizer.nextToken(self.token)
546        tokid = self.token.id
547        if tokid == id:
548            return
549        if self.token.value == id and (tokid == tokIDENT or tokid == tokNUMBER):
550            return
551        raise BadExpectedToken, "###  BAD TOKEN: '%s' expecting '%s'" % (self.token.id,id)
552
553    def expectToken(self,id,line,col):
554        self.expect(id)
555        if self.token.lineno != line:
556            raise BadExpectedToken, "###  BAD LINENO: token '%s' got '%d' expecting '%d'" % (id,self.token.lineno,line)
557        if self.token.colno != col:
558            raise BadExpectedToken, "###  BAD COLNO: '%d' expecting '%d'" % (self.token.colno,col)
559
560    def expectTokenVal(self,id,value,line,col):
561        self.expectToken(id,line,col)
562        if self.token.value != value:
563            raise BadExpectedToken, "###  BAD VALUE: '%s' expecting '%s'" % (self.token.value,value)
564
565    def expectList(self,list):
566        for item in list:
567            self.expect(item)
568
569def test_CppTokenizer():
570    print "running CppTokenizer tests"
571    tester = CppTokenizerTester()
572
573    tester.setTokenizer( CppLineTokenizer("#an/example  && (01923_xy)") )
574    tester.expectList( ["#", "an", "/", "example", tokSPACE, tokLOGICAND, tokSPACE, tokLPAREN, "01923_xy", \
575                       tokRPAREN, tokLN, tokEOF] )
576
577    tester.setTokenizer( CppLineTokenizer("FOO(BAR) && defined(BAZ)") )
578    tester.expectList( ["FOO", tokLPAREN, "BAR", tokRPAREN, tokSPACE, tokLOGICAND, tokSPACE,
579                        tokDEFINED, tokLPAREN, "BAZ", tokRPAREN, tokLN, tokEOF] )
580
581    tester.setTokenizer( CppLinesTokenizer( ["/*", "#", "*/"] ) )
582    tester.expectList( [ tokSPACE, tokLN, tokEOF ] )
583
584    tester.setTokenizer( CppLinesTokenizer( ["first", "second"] ) )
585    tester.expectList( [ "first", tokLN, "second", tokLN, tokEOF ] )
586
587    tester.setTokenizer( CppLinesTokenizer( ["first second", "  third"] ) )
588    tester.expectToken( "first", 1, 0 )
589    tester.expectToken( tokSPACE, 1, 5 )
590    tester.expectToken( "second", 1, 6 )
591    tester.expectToken( tokLN, 1, 12 )
592    tester.expectToken( tokSPACE, 2, 0 )
593    tester.expectToken( "third", 2, 2 )
594
595    tester.setTokenizer( CppLinesTokenizer( [ "boo /* what the", "hell */" ] ) )
596    tester.expectList( [ "boo", tokSPACE ] )
597    tester.expectTokenVal( tokSPACE, "/* what the\nhell */", 1, 4 )
598    tester.expectList( [ tokLN, tokEOF ] )
599
600    tester.setTokenizer( CppLinesTokenizer( [ "an \\", " example" ] ) )
601    tester.expectToken( "an", 1, 0 )
602    tester.expectToken( tokSPACE, 1, 2 )
603    tester.expectTokenVal( tokSPACE, "\\", 1, 3 )
604    tester.expectToken( tokSPACE, 2, 0 )
605    tester.expectToken( "example", 2, 1 )
606    tester.expectToken( tokLN, 2, 8 )
607
608    return True
609
610
611#####################################################################################
612#####################################################################################
613#####                                                                           #####
614#####           C P P   E X P R E S S I O N S                                   #####
615#####                                                                           #####
616#####################################################################################
617#####################################################################################
618
619# Cpp expressions are modeled by tuples of the form (op,arg) or (op,arg1,arg2), etc..
620# op is an "operator" string
621
622class Expr:
623    """a class used to model a CPP expression"""
624    opInteger   = "int"
625    opIdent     = "ident"
626    opCall      = "call"
627    opDefined   = "defined"
628    opTest      = "?"
629    opLogicNot  = "!"
630    opNot       = "~"
631    opNeg       = "[-]"
632    opUnaryPlus = "[+]"
633    opAdd = "+"
634    opSub = "-"
635    opMul = "*"
636    opDiv = "/"
637    opMod = "%"
638    opAnd = "&"
639    opOr  = "|"
640    opXor = "^"
641    opLogicAnd = "&&"
642    opLogicOr  = "||"
643    opEqual = "=="
644    opNotEqual = "!="
645    opLess = "<"
646    opLessEq = "<="
647    opGreater = ">"
648    opGreaterEq = ">="
649    opShl = "<<"
650    opShr = ">>"
651
652    unaries  = [ opLogicNot, opNot, opNeg, opUnaryPlus ]
653    binaries = [ opAdd, opSub, opMul, opDiv, opMod, opAnd, opOr, opXor, opLogicAnd, opLogicOr,
654                 opEqual, opNotEqual, opLess, opLessEq, opGreater, opGreaterEq ]
655
656    precedences = {
657                    opTest: 0,
658                    opLogicOr:  1,
659                    opLogicNot: 2,
660                    opOr : 3,
661                    opXor: 4,
662                    opAnd: 5,
663                    opEqual: 6, opNotEqual: 6,
664                    opLess:7, opLessEq:7, opGreater:7, opGreaterEq:7,
665                    opShl:8, opShr:8,
666                    opAdd:9, opSub:9,
667                    opMul:10, opDiv:10, opMod:10,
668                    opLogicNot:11,
669                    opNot: 12,
670                    }
671
672    def __init__(self,op):
673        self.op = op
674
675    def __repr__(self):
676        return "(%s)" % self.op
677
678    def __str__(self):
679        return "operator(%s)" % self.op
680
681    def precedence(self):
682        """return the precedence of a given operator"""
683        return Expr.precedences.get(self.op, 1000)
684
685    def isUnary(self):
686        return self.op in Expr.unaries
687
688    def isBinary(self):
689        return self.op in Expr.binaries
690
691    def isDefined(self):
692        return self.op is opDefined
693
694    def toInt(self):
695        """return the integer value of a given expression. only valid for integer expressions
696           will return None otherwise"""
697        return None
698
699class IntExpr(Expr):
700    def __init__(self,value):
701        Expr.__init__(self,opInteger)
702        self.arg   = value
703
704    def __repr__(self):
705        return "(int %s)" % self.arg
706
707    def __str__(self):
708        return self.arg
709
710    def toInt(self):
711        s = self.arg  # string value
712        # get rid of U or L suffixes
713        while len(s) > 0 and s[-1] in "LUlu":
714            s = s[:-1]
715        return string.atoi(s)
716
717class IdentExpr(Expr):
718    def __init__(self,name):
719        Expr.__init__(self,opIdent)
720        self.name = name
721
722    def __repr__(self):
723        return "(ident %s)" % self.name
724
725    def __str__(self):
726        return self.name
727
728class CallExpr(Expr):
729    def __init__(self,funcname,params):
730        Expr.__init__(self,opCall)
731        self.funcname = funcname
732        self.params   = params
733
734    def __repr__(self):
735        result = "(call %s [" % self.funcname
736        comma  = ""
737        for param in self.params:
738            result += "%s%s" % (comma, repr(param))
739            comma   = ","
740        result += "])"
741        return result
742
743    def __str__(self):
744        result = "%s(" % self.funcname
745        comma = ""
746        for param in self.params:
747            result += "%s%s" % (comma, str(param))
748            comma = ","
749
750        result += ")"
751        return result
752
753class TestExpr(Expr):
754    def __init__(self,cond,iftrue,iffalse):
755        Expr.__init__(self,opTest)
756        self.cond    = cond
757        self.iftrue  = iftrue
758        self.iffalse = iffalse
759
760    def __repr__(self):
761        return "(?: %s %s %s)" % (repr(self.cond),repr(self.iftrue),repr(self.iffalse))
762
763    def __str__(self):
764        return "(%s) ? (%s) : (%s)" % (self.cond, self.iftrue, self.iffalse)
765
766class SingleArgExpr(Expr):
767    def __init__(self,op,arg):
768        Expr.__init__(self,op)
769        self.arg = arg
770
771    def __repr__(self):
772        return "(%s %s)" % (self.op, repr(self.arg))
773
774class DefinedExpr(SingleArgExpr):
775    def __init__(self,op,macroname):
776        SingleArgExpr.__init__(self.opDefined,macroname)
777
778    def __str__(self):
779        return "defined(%s)" % self.arg
780
781
782class UnaryExpr(SingleArgExpr):
783    def __init__(self,op,arg,opstr=None):
784        SingleArgExpr.__init__(self,op,arg)
785        if not opstr:
786            opstr = op
787        self.opstr = opstr
788
789    def __str__(self):
790        arg_s     = str(self.arg)
791        arg_prec  = self.arg.precedence()
792        self_prec = self.precedence()
793        if arg_prec < self_prec:
794            return "%s(%s)" % (self.opstr,arg_s)
795        else:
796            return "%s%s" % (self.opstr, arg_s)
797
798class TwoArgExpr(Expr):
799    def __init__(self,op,arg1,arg2):
800        Expr.__init__(self,op)
801        self.arg1 = arg1
802        self.arg2 = arg2
803
804    def __repr__(self):
805        return "(%s %s %s)" % (self.op, repr(self.arg1), repr(self.arg2))
806
807class BinaryExpr(TwoArgExpr):
808    def __init__(self,op,arg1,arg2,opstr=None):
809        TwoArgExpr.__init__(self,op,arg1,arg2)
810        if not opstr:
811            opstr = op
812        self.opstr = opstr
813
814    def __str__(self):
815        arg1_s    = str(self.arg1)
816        arg2_s    = str(self.arg2)
817        arg1_prec = self.arg1.precedence()
818        arg2_prec = self.arg2.precedence()
819        self_prec = self.precedence()
820
821        result = ""
822        if arg1_prec < self_prec:
823            result += "(%s)" % arg1_s
824        else:
825            result += arg1_s
826
827        result += " %s " % self.opstr
828
829        if arg2_prec < self_prec:
830            result += "(%s)" % arg2_s
831        else:
832            result += arg2_s
833
834        return result
835
836#####################################################################################
837#####################################################################################
838#####                                                                           #####
839#####           C P P   E X P R E S S I O N   P A R S E R                       #####
840#####                                                                           #####
841#####################################################################################
842#####################################################################################
843
844
845class ExprParser:
846    """a class used to convert a list of tokens into a cpp Expr object"""
847
848    re_octal   = re.compile(r"\s*\(0[0-7]+\).*")
849    re_decimal = re.compile(r"\s*\(\d+[ulUL]*\).*")
850    re_hexadecimal = re.compile(r"\s*\(0[xX][0-9a-fA-F]*\).*")
851
852    def __init__(self,tokens):
853        self.tok = tokens
854        self.n   = len(self.tok)
855        self.i   = 0
856
857    def mark(self):
858        return self.i
859
860    def release(self,pos):
861        self.i = pos
862
863    def peekId(self):
864        if self.i < self.n:
865            return self.tok[self.i].id
866        return None
867
868    def peek(self):
869        if self.i < self.n:
870            return self.tok[self.i]
871        return None
872
873    def skip(self):
874        if self.i < self.n:
875            self.i += 1
876
877    def skipOptional(self,id):
878        if self.i < self.n and self.tok[self.i].id == id:
879            self.i += 1
880
881    def skipSpaces(self):
882        i   = self.i
883        n   = self.n
884        tok = self.tok
885        while i < n and (tok[i] == tokSPACE or tok[i] == tokLN):
886            i += 1
887        self.i = i
888
889    # all the isXXX functions returns a (expr,nextpos) pair if a match is found
890    # or None if not
891
892    def is_integer(self):
893        id = self.tok[self.i].id
894        c  = id[0]
895        if c < '0' or c > '9':
896            return None
897
898        m = ExprParser.re_octal.match(id)
899        if m:
900            return (IntExpr(id), m.end(1))
901
902        m = ExprParser.re_decimal.match(id)
903        if m:
904            return (IntExpr(id), m.end(1))
905
906        m = ExprParser.re_hexadecimal(id)
907        if m:
908            return (IntExpr(id), m.end(1))
909
910        return None
911
912    def is_defined(self):
913        id = self.tok[self.i].id
914        if id != "defined":
915            return None
916
917        pos = self.mark()
918
919        use_paren = 0
920        if self.peekId() == tokLPAREN:
921            self.skip()
922            use_paren = 1
923
924        if self.peekId() != tokIDENT:
925            self.throw( BadExpectedToken, "identifier expected")
926
927        macroname = self.peek().value
928        self.skip()
929        if use_paren:
930            self.skipSpaces()
931            if self.peekId() != tokRPAREN:
932                self.throw( BadExpectedToken, "missing right-paren after 'defined' directive")
933            self.skip()
934
935        i = self.i
936        return (DefinedExpr(macroname),i+1)
937
938    def is_call_or_ident(self):
939        pass
940
941    def parse(self, i):
942        return None
943
944#####################################################################################
945#####################################################################################
946#####                                                                           #####
947#####           C P P   E X P R E S S I O N S                                   #####
948#####                                                                           #####
949#####################################################################################
950#####################################################################################
951
952class CppInvalidExpression(Exception):
953    """an exception raised when an invalid/unsupported cpp expression is detected"""
954    pass
955
956class CppExpr:
957    """a class that models the condition of #if directives into
958        an expression tree. each node in the tree is of the form (op,arg) or (op,arg1,arg2)
959        where "op" is a string describing the operation"""
960
961    unaries  = [ "!", "~" ]
962    binaries = [ "+", "-", "<", "<=", ">=", ">", "&&", "||", "*", "/", "%", "&", "|", "^", "<<", ">>", "==", "!=" ]
963    precedences = { "||": 1,
964                    "&&": 2,
965                     "|": 3,
966                     "^": 4,
967                     "&": 5,
968                     "==":6, "!=":6,
969                     "<":7, "<=":7, ">":7, ">=":7,
970                     "<<":8, ">>":8,
971                     "+":9, "-":9,
972                     "*":10, "/":10, "%":10,
973                     "!":11, "~":12
974                     }
975
976    def __init__(self, tokens):
977        """initialize a CppExpr. 'tokens' must be a CppToken list"""
978        self.tok  = tokens
979        self.n    = len(tokens)
980        if debugCppExpr:
981            print "CppExpr: trying to parse %s" % repr(tokens)
982        expr      = self.is_expr(0)
983        if debugCppExpr:
984            print "CppExpr: got " + repr(expr)
985        self.expr = expr[0]
986
987    re_cpp_constant = re.compile(r"((\d|\w|_)+)")
988
989    def throw(self,exception,i,msg):
990        if i < self.n:
991            tok = self.tok[i]
992            print "%d:%d: %s" % (tok.lineno,tok.colno,msg)
993        else:
994            print "EOF: %s" % msg
995        raise exception
996
997    def skip_spaces(self,i):
998        """skip spaces in input token list"""
999        while i < self.n:
1000            t = self.tok[i]
1001            if t.id != tokSPACE and t.id != tokLN:
1002                break
1003            i += 1
1004        return i
1005
1006    def expectId(self,i,id):
1007        """check that a given token id is at the current position, then skip over it"""
1008        i = self.skip_spaces(i)
1009        if i >= self.n or self.tok[i].id != id:
1010            self.throw(BadExpectedToken,i,"### expecting '%s' in expression, got '%s'" % (id, self.tok[i].id))
1011        return i+1
1012
1013    def expectIdent(self,i):
1014        i = self.skip_spaces(i)
1015        if i >= self.n or self.tok[i].id != tokIDENT:
1016            self.throw(BadExpectedToken,i,"### expecting identifier in expression, got '%s'" % (id, self.tok[i].id))
1017        return i+1
1018
1019    # the is_xxxxx function returns either None or a pair (e,nextpos)
1020    # where 'e' is an expression tuple (e.g. (op,arg)) and 'nextpos' is
1021    # the corresponding next position in the input token list
1022    #
1023
1024    def is_decimal(self,i):
1025        v = self.tok[i].value[:]
1026        while len(v) > 0 and v[-1] in "ULul":
1027            v = v[:-1]
1028        for digit in v:
1029            if not digit.isdigit():
1030                return None
1031
1032        # for an integer expression tuple, the argument
1033        # is simply the value as an integer
1034        val = string.atoi(v)
1035        return ("int", val), i+1
1036
1037    def is_hexadecimal(self,i):
1038        v = self.tok[i].value[:]
1039        while len(v) > 0 and v[-1] in "ULul":
1040            v = v[:-1]
1041        if len(v) > 2 and (v[0:2] == "0x" or v[0:2] == "0X"):
1042            for digit in v[2:]:
1043                if not digit in "0123456789abcdefABCDEF":
1044                    return None
1045
1046            # for an hex expression tuple, the argument
1047            # is the value as an integer
1048            val = int(v[2:], 16)
1049            return ("hex", val), i+1
1050
1051        return None
1052
1053    def is_integer(self,i):
1054        if self.tok[i].id != tokNUMBER:
1055            return None
1056
1057        c = self.is_decimal(i)
1058        if c: return c
1059
1060        c = self.is_hexadecimal(i)
1061        if c: return c
1062
1063        return None
1064
1065    def is_number(self,i):
1066        t = self.tok[i]
1067        if t.id == tokMINUS and i+1 < self.n:
1068            c = self.is_integer(i+1)
1069            if c:
1070                e, i2 = c
1071                op, val  = e
1072                return (op, -val), i2
1073        if t.id == tokPLUS and i+1 < self.n:
1074            c = self.is_integer(i+1)
1075            if c: return c
1076
1077        return self.is_integer(i)
1078
1079
1080    def is_alnum(self,i):
1081        """test wether a given token is alpha-numeric"""
1082        i = self.skip_spaces(i)
1083        if i >= self.n:
1084            return None
1085        t = self.tok[i]
1086        m = CppExpr.re_cpp_constant.match(t.id)
1087        if m:
1088            #print "... alnum '%s'" % m.group(1)
1089            r = m.group(1)
1090            return ("ident", r), i+1
1091        return None
1092
1093    def is_defined(self,i):
1094        t = self.tok[i]
1095        if t.id != tokDEFINED:
1096            return None
1097
1098        # we have the defined keyword, check the rest
1099        i = self.skip_spaces(i+1)
1100        use_parens = 0
1101        if i < self.n and self.tok[i].id == tokLPAREN:
1102            use_parens = 1
1103            i = self.skip_spaces(i+1)
1104
1105        if i >= self.n:
1106            self.throw(CppConstantExpected,i,"### 'defined' must be followed  by macro name or left paren")
1107
1108        t = self.tok[i]
1109        if t.id != tokIDENT:
1110            self.throw(CppConstantExpected,i,"### 'defined' must be followed by macro name")
1111
1112        i += 1
1113        if use_parens:
1114            i = self.expectId(i,tokRPAREN)
1115
1116        return ("defined",t.value), i
1117
1118
1119    def is_call_or_ident(self,i):
1120        i = self.skip_spaces(i)
1121        if i >= self.n:
1122            return None
1123
1124        t = self.tok[i]
1125        if t.id != tokIDENT:
1126            return None
1127
1128        name = t.value
1129
1130        i = self.skip_spaces(i+1)
1131        if i >= self.n or self.tok[i].id != tokLPAREN:
1132            return ("ident", name), i
1133
1134        params    = []
1135        depth     = 1
1136        i += 1
1137        j  = i
1138        while i < self.n:
1139            id = self.tok[i].id
1140            if id == tokLPAREN:
1141                depth += 1
1142            elif depth == 1 and (id == tokCOMMA or id == tokRPAREN):
1143                while j < i and self.tok[j].id == tokSPACE:
1144                    j += 1
1145                k = i
1146                while k > j and self.tok[k-1].id == tokSPACE:
1147                    k -= 1
1148                param = self.tok[j:k]
1149                params.append( param )
1150                if id == tokRPAREN:
1151                    break
1152                j = i+1
1153            elif id == tokRPAREN:
1154                depth -= 1
1155            i += 1
1156
1157        if i >= self.n:
1158            return None
1159
1160        return ("call", (name, params)), i+1
1161
1162    def is_token(self,i,token):
1163        i = self.skip_spaces(i)
1164        if i >= self.n or self.tok[i].id != token:
1165            return None
1166        return token, i+1
1167
1168
1169    def is_value(self,i):
1170        t = self.tok[i]
1171        if t.id == tokSTRING:
1172            return ("string", t.value), i+1
1173
1174        c = self.is_number(i)
1175        if c: return c
1176
1177        c = self.is_defined(i)
1178        if c: return c
1179
1180        c = self.is_call_or_ident(i)
1181        if c: return c
1182
1183        i = self.skip_spaces(i)
1184        if i >= self.n or self.tok[i].id != tokLPAREN:
1185            return None
1186
1187        popcount = 1
1188        i2       = i+1
1189        while i2 < self.n:
1190            t = self.tok[i2]
1191            if t.id == tokLPAREN:
1192                popcount += 1
1193            elif t.id == tokRPAREN:
1194                popcount -= 1
1195                if popcount == 0:
1196                    break
1197            i2 += 1
1198
1199        if popcount != 0:
1200            self.throw(CppInvalidExpression, i, "expression missing closing parenthesis")
1201
1202        if debugCppExpr:
1203            print "CppExpr: trying to parse sub-expression %s" % repr(self.tok[i+1:i2])
1204        oldcount   = self.n
1205        self.n     = i2
1206        c          = self.is_expr(i+1)
1207        self.n     = oldcount
1208        if not c:
1209            self.throw(CppInvalidExpression, i, "invalid expression within parenthesis")
1210
1211        e, i = c
1212        return e, i2+1
1213
1214    def is_unary(self,i):
1215        i = self.skip_spaces(i)
1216        if i >= self.n:
1217            return None
1218
1219        t = self.tok[i]
1220        if t.id in CppExpr.unaries:
1221            c = self.is_unary(i+1)
1222            if not c:
1223                self.throw(CppInvalidExpression, i, "%s operator must be followed by value" % t.id)
1224            e, i = c
1225            return (t.id, e), i
1226
1227        return self.is_value(i)
1228
1229    def is_binary(self,i):
1230        i = self.skip_spaces(i)
1231        if i >= self.n:
1232            return None
1233
1234        c = self.is_unary(i)
1235        if not c:
1236            return None
1237
1238        e1, i2 = c
1239        i2 = self.skip_spaces(i2)
1240        if i2 >= self.n:
1241            return c
1242
1243        t = self.tok[i2]
1244        if t.id in CppExpr.binaries:
1245            c = self.is_binary(i2+1)
1246            if not c:
1247                self.throw(CppInvalidExpression, i,"### %s operator must be followed by value" % t.id )
1248            e2, i3 = c
1249            return (t.id, e1, e2), i3
1250
1251        return None
1252
1253    def is_expr(self,i):
1254        return self.is_binary(i)
1255
1256    def dump_node(self,e):
1257        op = e[0]
1258        line = "(" + op
1259        if op == "int":
1260            line += " %d)" % e[1]
1261        elif op == "hex":
1262            line += " 0x%x)" % e[1]
1263        elif op == "ident":
1264            line += " %s)" % e[1]
1265        elif op == "defined":
1266            line += " %s)" % e[1]
1267        elif op == "call":
1268            arg = e[1]
1269            line += " %s [" % arg[0]
1270            prefix = ""
1271            for param in arg[1]:
1272                par = ""
1273                for tok in param:
1274                    par += str(tok)
1275                line += "%s%s" % (prefix, par)
1276                prefix = ","
1277            line += "])"
1278        elif op in CppExpr.unaries:
1279            line += " %s)" % self.dump_node(e[1])
1280        elif op in CppExpr.binaries:
1281            line += " %s %s)" % (self.dump_node(e[1]), self.dump_node(e[2]))
1282        else:
1283            line += " ?%s)" % repr(e[1])
1284
1285        return line
1286
1287    def __repr__(self):
1288        return self.dump_node(self.expr)
1289
1290    def source_node(self,e):
1291        op = e[0]
1292        if op == "int":
1293            return "%d" % e[1]
1294        if op == "hex":
1295            return "0x%x" % e[1]
1296        if op == "ident":
1297            # XXX: should try to expand
1298            return e[1]
1299        if op == "defined":
1300            return "defined(%s)" % e[1]
1301
1302        prec = CppExpr.precedences.get(op,1000)
1303        arg  = e[1]
1304        if op in CppExpr.unaries:
1305            arg_src = self.source_node(arg)
1306            arg_op  = arg[0]
1307            arg_prec = CppExpr.precedences.get(arg[0],1000)
1308            if arg_prec < prec:
1309                return "!(" + arg_src + ")"
1310            else:
1311                return "!" + arg_src
1312        if op in CppExpr.binaries:
1313            arg2     = e[2]
1314            arg1_op  = arg[0]
1315            arg2_op  = arg2[0]
1316            arg1_src = self.source_node(arg)
1317            arg2_src = self.source_node(arg2)
1318            if CppExpr.precedences.get(arg1_op,1000) < prec:
1319                arg1_src = "(%s)" % arg1_src
1320            if CppExpr.precedences.get(arg2_op,1000) < prec:
1321                arg2_src = "(%s)" % arg2_src
1322
1323            return "%s %s %s" % (arg1_src, op, arg2_src)
1324        return "???"
1325
1326    def __str__(self):
1327        return self.source_node(self.expr)
1328
1329    def int_node(self,e):
1330        if e[0] == "int":
1331            return e[1]
1332        elif e[1] == "hex":
1333            return int(e[1],16)
1334        else:
1335            return None
1336
1337    def toInt(self):
1338        return self.int_node(self.expr)
1339
1340    def optimize_node(self,e,macros={}):
1341        op = e[0]
1342        if op == "defined":
1343            name = e[1]
1344            if macros.has_key(name):
1345                if macros[name] == kCppUndefinedMacro:
1346                    return ("int", 0)
1347                else:
1348                    return ("int", 1)
1349
1350            if kernel_remove_config_macros and name.startswith("CONFIG_"):
1351                return ("int", 0)
1352
1353        elif op == "!":
1354            op, v = e
1355            v = self.optimize_node(v, macros)
1356            if v[0] == "int":
1357                if v[1] == 0:
1358                    return ("int", 1)
1359                else:
1360                    return ("int", 0)
1361
1362        elif op == "&&":
1363            op, l, r = e
1364            l  = self.optimize_node(l, macros)
1365            r  = self.optimize_node(r, macros)
1366            li = self.int_node(l)
1367            ri = self.int_node(r)
1368            if li != None:
1369                if li == 0:
1370                    return ("int", 0)
1371                else:
1372                    return r
1373
1374        elif op == "||":
1375            op, l, r = e
1376            l  = self.optimize_node(l, macros)
1377            r  = self.optimize_node(r, macros)
1378            li = self.int_node(l)
1379            ri = self.int_node(r)
1380            if li != None:
1381                if li == 0:
1382                    return r
1383                else:
1384                    return ("int", 1)
1385            elif ri != None:
1386                if ri == 0:
1387                    return l
1388                else:
1389                    return ("int", 1)
1390        return e
1391
1392    def optimize(self,macros={}):
1393        self.expr = self.optimize_node(self.expr,macros)
1394
1395    def removePrefixedNode(self,e,prefix,names):
1396        op = e[0]
1397        if op == "defined":
1398            name = e[1]
1399            if name.startswith(prefix):
1400                if names.has_key[name] and names[name] == "y":
1401                    return ("int", 1)
1402                else:
1403                    return ("int", 0)
1404
1405        elif op in CppExpr.unaries:
1406            op, v = e
1407            v = self.removePrefixedNode(v,prefix,names)
1408            return (op, v)
1409        elif op in CppExpr.binaries:
1410            op, v1, v2 = e
1411            v1 = self.removePrefixedNode(v1,prefix,names)
1412            v2 = self.removePrefixedNode(v2,prefix,names)
1413            return (op, v1, v2)
1414        elif op == "call":
1415            func, params = e[1]
1416            params2 = []
1417            for param in params:
1418                params2.append( self.removePrefixedNode(param,prefix,names) )
1419            return (op, (func, params2))
1420
1421        return e
1422
1423    def removePrefixed(self,prefix,names={}):
1424        self.expr = self.removePrefixedNode(self.expr,prefix,names)
1425
1426    def is_equal_node(self,e1,e2):
1427        if e1[0] != e2[0] or len(e1) != len(e2):
1428            return False
1429
1430        op = e1[0]
1431        if op == "int" or op == "hex" or op == "!" or op == "defined":
1432            return e1[0] == e2[0]
1433
1434        return self.is_equal_node(e1[1],e2[1]) and self.is_equal_node(e1[2],e2[2])
1435
1436    def is_equal(self,other):
1437        return self.is_equal_node(self.expr,other.expr)
1438
1439def test_cpp_expr(expr, expected):
1440    e = CppExpr( CppLineTokenizer( expr ).toTokenList() )
1441    #print repr(e.expr)
1442    s1 = repr(e)
1443    if s1 != expected:
1444        print "KO: expression '%s' generates '%s', should be '%s'" % (expr, s1, expected)
1445    else:
1446        #print "OK: expression '%s'" % expr
1447        pass
1448
1449def test_cpp_expr_optim(expr, expected, macros={}):
1450    e = CppExpr( CppLineTokenizer( expr ).toTokenList() )
1451    e.optimize(macros)
1452
1453    s1 = repr(e)
1454    if s1 != expected:
1455        print "KO: optimized expression '%s' generates '%s', should be '%s'" % (expr, s1, expected)
1456    else:
1457        #print "OK: optmized expression '%s'" % expr
1458        pass
1459
1460def test_cpp_expr_source(expr, expected):
1461    e = CppExpr( CppLineTokenizer( expr ).toTokenList() )
1462    s1 = str(e)
1463    if s1 != expected:
1464        print "KO: source expression '%s' generates '%s', should be '%s'" % (expr, s1, expected)
1465    else:
1466        #print "OK: source expression '%s'" % expr
1467        pass
1468
1469def test_CppExpr():
1470    print "testing CppExpr"
1471    test_cpp_expr( "0", "(int 0)" )
1472    test_cpp_expr( "1", "(int 1)" )
1473    test_cpp_expr( "1 && 1", "(&& (int 1) (int 1))" )
1474    test_cpp_expr( "1 && 0", "(&& (int 1) (int 0))" )
1475    test_cpp_expr( "EXAMPLE", "(ident EXAMPLE)" )
1476    test_cpp_expr( "EXAMPLE - 3", "(- (ident EXAMPLE) (int 3))" )
1477    test_cpp_expr( "defined(EXAMPLE)", "(defined EXAMPLE)" )
1478    test_cpp_expr( "!defined(EXAMPLE)", "(! (defined EXAMPLE))" )
1479    test_cpp_expr( "defined(ABC) || defined(BINGO)", "(|| (defined ABC) (defined BINGO))" )
1480    test_cpp_expr( "FOO(BAR)", "(call FOO [BAR])" )
1481
1482    test_cpp_expr_optim( "0", "(int 0)" )
1483    test_cpp_expr_optim( "1", "(int 1)" )
1484    test_cpp_expr_optim( "1 && 1", "(int 1)" )
1485    test_cpp_expr_optim( "1 && 0", "(int 0)" )
1486    test_cpp_expr_optim( "0 && 1", "(int 0)" )
1487    test_cpp_expr_optim( "0 && 0", "(int 0)" )
1488    test_cpp_expr_optim( "1 || 1", "(int 1)" )
1489    test_cpp_expr_optim( "1 || 0", "(int 1)" )
1490    test_cpp_expr_optim( "0 || 1", "(int 1)" )
1491    test_cpp_expr_optim( "0 || 0", "(int 0)" )
1492    test_cpp_expr_optim( "EXAMPLE", "(ident EXAMPLE)" )
1493    test_cpp_expr_optim( "EXAMPLE - 3", "(- (ident EXAMPLE) (int 3))" )
1494    test_cpp_expr_optim( "defined(EXAMPLE)", "(defined EXAMPLE)" )
1495    test_cpp_expr_optim( "defined(EXAMPLE)", "(int 1)", { "EXAMPLE": "XOWOE" } )
1496    test_cpp_expr_optim( "defined(EXAMPLE)", "(int 0)", { "EXAMPLE": kCppUndefinedMacro} )
1497    test_cpp_expr_optim( "!defined(EXAMPLE)", "(! (defined EXAMPLE))" )
1498    test_cpp_expr_optim( "!defined(EXAMPLE)", "(int 0)", { "EXAMPLE" : "XOWOE" } )
1499    test_cpp_expr_optim( "!defined(EXAMPLE)", "(int 1)", { "EXAMPLE" : kCppUndefinedMacro } )
1500    test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(|| (defined ABC) (defined BINGO))" )
1501    test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 1)", { "ABC" : "1" } )
1502    test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 1)", { "BINGO" : "1" } )
1503    test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(defined ABC)", { "BINGO" : kCppUndefinedMacro } )
1504    test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 0)", { "ABC" : kCppUndefinedMacro, "BINGO" : kCppUndefinedMacro } )
1505
1506    test_cpp_expr_source( "0", "0" )
1507    test_cpp_expr_source( "1", "1" )
1508    test_cpp_expr_source( "1 && 1", "1 && 1" )
1509    test_cpp_expr_source( "1 && 0", "1 && 0" )
1510    test_cpp_expr_source( "0 && 1", "0 && 1" )
1511    test_cpp_expr_source( "0 && 0", "0 && 0" )
1512    test_cpp_expr_source( "1 || 1", "1 || 1" )
1513    test_cpp_expr_source( "1 || 0", "1 || 0" )
1514    test_cpp_expr_source( "0 || 1", "0 || 1" )
1515    test_cpp_expr_source( "0 || 0", "0 || 0" )
1516    test_cpp_expr_source( "EXAMPLE", "EXAMPLE" )
1517    test_cpp_expr_source( "EXAMPLE - 3", "EXAMPLE - 3" )
1518    test_cpp_expr_source( "defined(EXAMPLE)", "defined(EXAMPLE)" )
1519    test_cpp_expr_source( "defined EXAMPLE", "defined(EXAMPLE)" )
1520
1521
1522#####################################################################################
1523#####################################################################################
1524#####                                                                           #####
1525#####          C P P   B L O C K                                                #####
1526#####                                                                           #####
1527#####################################################################################
1528#####################################################################################
1529
1530class Block:
1531    """a class used to model a block of input source text. there are two block types:
1532        - directive blocks: contain the tokens of a single pre-processor directive (e.g. #if)
1533        - text blocks, contain the tokens of non-directive blocks
1534
1535       the cpp parser class below will transform an input source file into a list of Block
1536       objects (grouped in a BlockList object for convenience)"""
1537
1538    def __init__(self,tokens,directive=None,lineno=0):
1539        """initialize a new block, if 'directive' is None, this is a text block
1540           NOTE: this automatically converts '#ifdef MACRO' into '#if defined(MACRO)'
1541                 and '#ifndef MACRO' into '#if !defined(MACRO)'"""
1542        if directive == "ifdef":
1543            tok = Token()
1544            tok.set(tokDEFINED)
1545            tokens = [ tok ] + tokens
1546            directive = "if"
1547
1548        elif directive == "ifndef":
1549            tok1 = Token()
1550            tok2 = Token()
1551            tok1.set(tokNOT)
1552            tok2.set(tokDEFINED)
1553            tokens = [ tok1, tok2 ] + tokens
1554            directive = "if"
1555
1556        self.tokens    = tokens
1557        self.directive = directive
1558        if lineno > 0:
1559            self.lineno = lineno
1560        else:
1561            self.lineno = self.tokens[0].lineno
1562
1563        if self.isIf():
1564            self.expr = CppExpr( self.tokens )
1565
1566    def isDirective(self):
1567        """returns True iff this is a directive block"""
1568        return self.directive != None
1569
1570    def isConditional(self):
1571        """returns True iff this is a conditional directive block"""
1572        return self.directive in ["if","ifdef","ifndef","else","elif","endif"]
1573
1574    def isDefine(self):
1575        """returns the macro name in a #define directive, or None otherwise"""
1576        if self.directive != "define":
1577            return None
1578
1579        return self.tokens[0].value
1580
1581    def isIf(self):
1582        """returns True iff this is an #if-like directive block"""
1583        return self.directive in ["if","ifdef","ifndef","elif"]
1584
1585    def isInclude(self):
1586        """checks wether this is a #include directive. if true, then returns the
1587           corresponding file name (with brackets or double-qoutes). None otherwise"""
1588        if self.directive != "include":
1589            return None
1590
1591        #print "iii " + repr(self.tokens)
1592        if self.tokens[0].id == tokSTRING:
1593            # a double-quote include, that's easy
1594            return self.tokens[0].value
1595
1596        # we only want the bracket part, not any comments or junk after it
1597        if self.tokens[0].id == "<":
1598            i   = 0
1599            tok = self.tokens
1600            n   = len(tok)
1601            while i < n and tok[i].id != ">":
1602                i += 1
1603
1604            if i >= n:
1605                return None
1606
1607            return string.join([ str(x) for x in tok[:i+1] ],"")
1608
1609        else:
1610            return None
1611
1612    def removeWhiteSpace(self):
1613        # Remove trailing whitespace and empty lines
1614        # All whitespace is also contracted to a single space
1615        if self.directive != None:
1616            return
1617
1618        tokens = []
1619        line   = 0     # index of line start
1620        space  = -1    # index of first space, or -1
1621        ii = 0
1622        nn = len(self.tokens)
1623        while ii < nn:
1624            tok = self.tokens[ii]
1625
1626            # If we find a space, record its position if this is the first
1627            # one the line start or the previous character. Don't append
1628            # anything to tokens array yet though.
1629            if tok.id == tokSPACE:
1630                if space < 0:
1631                    space = ii
1632                ii += 1
1633                continue
1634
1635            # If this is a line space, ignore the spaces we found previously
1636            # on the line, and remove empty lines.
1637            if tok.id == tokLN:
1638                old_line  = line
1639                old_space = space
1640                #print "N line=%d space=%d ii=%d" % (line, space, ii)
1641                ii   += 1
1642                line  = ii
1643                space = -1
1644                if old_space == old_line:  # line only contains spaces
1645                    #print "-s"
1646                    continue
1647                if ii-1 == old_line:  # line is empty
1648                    #print "-e"
1649                    continue
1650                tokens.append(tok)
1651                continue
1652
1653            # Other token, append any space range if any, converting each
1654            # one to a single space character, then append the token.
1655            if space >= 0:
1656                jj = space
1657                space = -1
1658                while jj < ii:
1659                    tok2 = self.tokens[jj]
1660                    tok2.value = " "
1661                    tokens.append(tok2)
1662                    jj += 1
1663
1664            tokens.append(tok)
1665            ii += 1
1666
1667        self.tokens = tokens
1668
1669    def writeWithWarning(self,out,warning,left_count,repeat_count):
1670        # removeWhiteSpace() will sometimes creates non-directive blocks
1671        # without any tokens. These come from blocks that only contained
1672        # empty lines and spaces. They should not be printed in the final
1673        # output, and then should not be counted for this operation.
1674        #
1675        if not self.directive and self.tokens == []:
1676            return left_count
1677
1678        if self.directive:
1679            out.write(str(self).rstrip() + "\n")
1680            left_count -= 1
1681            if left_count == 0:
1682                out.write(warning)
1683                left_count = repeat_count
1684
1685        else:
1686            for tok in self.tokens:
1687                out.write(str(tok))
1688                if tok.id == tokLN:
1689                    left_count -= 1
1690                    if left_count == 0:
1691                        out.write(warning)
1692                        left_count = repeat_count
1693
1694        return left_count
1695
1696
1697    def __repr__(self):
1698        """generate the representation of a given block"""
1699        if self.directive:
1700            result = "#%s " % self.directive
1701            if self.isIf():
1702                result += repr(self.expr)
1703            else:
1704                for tok in self.tokens:
1705                    result += repr(tok)
1706        else:
1707            result = ""
1708            for tok in self.tokens:
1709                result += repr(tok)
1710
1711        return result
1712
1713    def __str__(self):
1714        """generate the string representation of a given block"""
1715        if self.directive:
1716            if self.directive == "if":
1717                # small optimization to re-generate #ifdef and #ifndef
1718                e = self.expr.expr
1719                op = e[0]
1720                if op == "defined":
1721                    result = "#ifdef %s" % e[1]
1722                elif op == "!" and e[1][0] == "defined":
1723                    result = "#ifndef %s" % e[1][1]
1724                else:
1725                    result = "#if " + str(self.expr)
1726            else:
1727                result = "#%s" % self.directive
1728                if len(self.tokens):
1729                    result += " "
1730                for tok in self.tokens:
1731                    result += str(tok)
1732        else:
1733            result = ""
1734            for tok in self.tokens:
1735                result += str(tok)
1736
1737        return result
1738
1739class BlockList:
1740    """a convenience class used to hold and process a list of blocks returned by
1741       the cpp parser"""
1742    def __init__(self,blocks):
1743        self.blocks = blocks
1744
1745    def __len__(self):
1746        return len(self.blocks)
1747
1748    def __getitem__(self,n):
1749        return self.blocks[n]
1750
1751    def __repr__(self):
1752        return repr(self.blocks)
1753
1754    def __str__(self):
1755        result = ""
1756        for b in self.blocks:
1757            result += str(b)
1758            if b.isDirective():
1759                result = result.rstrip() + '\n'
1760        return result
1761
1762    def  optimizeIf01(self):
1763        """remove the code between #if 0 .. #endif in a BlockList"""
1764        self.blocks = optimize_if01(self.blocks)
1765
1766    def optimizeMacros(self, macros):
1767        """remove known defined and undefined macros from a BlockList"""
1768        for b in self.blocks:
1769            if b.isIf():
1770                b.expr.optimize(macros)
1771
1772    def removeMacroDefines(self,macros):
1773        """remove known macro definitions from a BlockList"""
1774        self.blocks = remove_macro_defines(self.blocks,macros)
1775
1776    def removePrefixed(self,prefix,names):
1777        for b in self.blocks:
1778            if b.isIf():
1779                b.expr.removePrefixed(prefix,names)
1780
1781    def removeWhiteSpace(self):
1782        for b in self.blocks:
1783            b.removeWhiteSpace()
1784
1785    def optimizeAll(self,macros):
1786        self.optimizeMacros(macros)
1787        self.optimizeIf01()
1788        return
1789
1790    def findIncludes(self):
1791        """return the list of included files in a BlockList"""
1792        result = []
1793        for b in self.blocks:
1794            i = b.isInclude()
1795            if i:
1796                result.append(i)
1797
1798        return result
1799
1800
1801    def write(self,out):
1802        out.write(str(self))
1803
1804    def writeWithWarning(self,out,warning,repeat_count):
1805        left_count = repeat_count
1806        for b in self.blocks:
1807            left_count = b.writeWithWarning(out,warning,left_count,repeat_count)
1808
1809    def removeComments(self):
1810        for b in self.blocks:
1811            for tok in b.tokens:
1812                if tok.id == tokSPACE:
1813                    tok.value = " "
1814
1815    def removeVarsAndFuncs(self,knownStatics=set()):
1816        """remove all extern and static declarations corresponding
1817           to variable and function declarations. we only accept typedefs
1818           and enum/structs/union declarations.
1819
1820           however, we keep the definitions corresponding to the set
1821           of known static inline functions in the set 'knownStatics',
1822           which is useful for optimized byteorder swap functions and
1823           stuff like that.
1824           """
1825        # state = 0 => normal (i.e. LN + spaces)
1826        # state = 1 => typedef/struct encountered, ends with ";"
1827        # state = 2 => var declaration encountered, ends with ";"
1828        # state = 3 => func declaration encountered, ends with "}"
1829        state      = 0
1830        depth      = 0
1831        blocks2    = []
1832        skipTokens = False
1833        for b in self.blocks:
1834            if b.isDirective():
1835                blocks2.append(b)
1836            else:
1837                n     = len(b.tokens)
1838                i     = 0
1839                if skipTokens:
1840                    first = n
1841                else:
1842                    first = 0
1843                while i < n:
1844                    tok = b.tokens[i]
1845                    tokid = tok.id
1846                    # If we are not looking for the start of a new
1847                    # type/var/func, then skip over tokens until
1848                    # we find our terminator, managing the depth of
1849                    # accolades as we go.
1850                    if state > 0:
1851                        terminator = False
1852                        if tokid == '{':
1853                            depth += 1
1854                        elif tokid == '}':
1855                            if depth > 0:
1856                                depth -= 1
1857                            if (depth == 0) and (state == 3):
1858                                terminator = True
1859                        elif tokid == ';' and depth == 0:
1860                            terminator = True
1861
1862                        if terminator:
1863                            # we found the terminator
1864                            state = 0
1865                            if skipTokens:
1866                                skipTokens = False
1867                                first = i+1
1868
1869                        i = i+1
1870                        continue
1871
1872                    # We are looking for the start of a new type/func/var
1873                    # ignore whitespace
1874                    if tokid in [tokLN, tokSPACE]:
1875                        i = i+1
1876                        continue
1877
1878                    # Is it a new type definition, then start recording it
1879                    if tok.value in [ 'struct', 'typedef', 'enum', 'union', '__extension__' ]:
1880                        #print "$$$ keep type declr" + repr(b.tokens[i:])
1881                        state = 1
1882                        i     = i+1
1883                        continue
1884
1885                    # Is it a variable or function definition. If so, first
1886                    # try to determine which type it is, and also extract
1887                    # its name.
1888                    #
1889                    # We're going to parse the next tokens of the same block
1890                    # until we find a semi-column or a left parenthesis.
1891                    #
1892                    # The semi-column corresponds to a variable definition,
1893                    # the left-parenthesis to a function definition.
1894                    #
1895                    # We also assume that the var/func name is the last
1896                    # identifier before the terminator.
1897                    #
1898                    j = i+1
1899                    ident = ""
1900                    while j < n:
1901                        tokid = b.tokens[j].id
1902                        if tokid == '(':  # a function declaration
1903                            state = 3
1904                            break
1905                        elif tokid == ';': # a variable declaration
1906                            state = 2
1907                            break
1908                        if tokid == tokIDENT:
1909                            ident = b.tokens[j].value
1910                        j += 1
1911
1912                    if j >= n:
1913                        # This can only happen when the declaration
1914                        # does not end on the current block (e.g. with
1915                        # a directive mixed inside it.
1916                        #
1917                        # We will treat it as malformed because
1918                        # it's very hard to recover from this case
1919                        # without making our parser much more
1920                        # complex.
1921                        #
1922                        #print "### skip unterminated static '%s'" % ident
1923                        break
1924
1925                    if ident in knownStatics:
1926                        #print "### keep var/func '%s': %s" % (ident,repr(b.tokens[i:j]))
1927                        pass
1928                    else:
1929                        # We're going to skip the tokens for this declaration
1930                        #print "### skip variable /func'%s': %s" % (ident,repr(b.tokens[i:j]))
1931                        if i > first:
1932                            blocks2.append( Block(b.tokens[first:i]))
1933                        skipTokens = True
1934                        first      = n
1935
1936                    i = i+1
1937
1938                if i > first:
1939                    #print "### final '%s'" % repr(b.tokens[first:i])
1940                    blocks2.append( Block(b.tokens[first:i]) )
1941
1942        self.blocks = blocks2
1943
1944    def insertDisclaimer(self,disclaimer="/* auto-generated file, DO NOT EDIT */"):
1945        """insert your standard issue disclaimer that this is an
1946           auto-generated file, etc.."""
1947        tokens = CppLineTokenizer( disclaimer ).toTokenList()
1948        tokens = tokens[:-1]  # remove trailing tokLN
1949        self.blocks = [ Block(tokens) ] + self.blocks
1950
1951    def replaceTokens(self,replacements=dict()):
1952        """replace tokens according to the given dict
1953           """
1954        for b in self.blocks:
1955            if not b.isDirective():
1956                for tok in b.tokens:
1957                    if tok.id == tokIDENT:
1958                        if tok.value in replacements:
1959                            tok.value = replacements[tok.value]
1960
1961class BlockParser:
1962    """a class used to convert an input source file into a BlockList object"""
1963
1964    def __init__(self,tokzer=None):
1965        """initialize a block parser. the input source is provided through a Tokenizer
1966           object"""
1967        self.reset(tokzer)
1968
1969    def reset(self,tokzer):
1970        self.state  = 1
1971        self.tokzer = tokzer
1972
1973    def getBlocks(self,tokzer=None):
1974        """tokenize and parse the input source, return a BlockList object
1975           NOTE: empty and line-numbering directives are ignored and removed
1976                 from the result. as a consequence, it is possible to have
1977                 two successive text blocks in the result"""
1978        # state 0 => in source code
1979        # state 1 => in source code, after a LN
1980        # state 2 => in source code, after LN then some space
1981        state   = 1
1982        lastLN  = 0
1983        current = []
1984        blocks  = []
1985
1986        if tokzer == None:
1987            tokzer = self.tokzer
1988
1989        while 1:
1990            tok = tokzer.getToken()
1991            if tok.id == tokEOF:
1992                break
1993
1994            if tok.id == tokLN:
1995                state    = 1
1996                current.append(tok)
1997                lastLN   = len(current)
1998
1999            elif tok.id == tokSPACE:
2000                if state == 1:
2001                    state = 2
2002                current.append(tok)
2003
2004            elif tok.id == "#":
2005                if state > 0:
2006                    # this is the start of a directive
2007
2008                    if lastLN > 0:
2009                        # record previous tokens as text block
2010                        block   = Block(current[:lastLN])
2011                        blocks.append(block)
2012                        lastLN  = 0
2013
2014                    current = []
2015
2016                    # skip spaces after the #
2017                    while 1:
2018                        tok = tokzer.getToken()
2019                        if tok.id != tokSPACE:
2020                            break
2021
2022                    if tok.id != tokIDENT:
2023                        # empty or line-numbering, ignore it
2024                        if tok.id != tokLN and tok.id != tokEOF:
2025                            while 1:
2026                                tok = tokzer.getToken()
2027                                if tok.id == tokLN or tok.id == tokEOF:
2028                                    break
2029                        continue
2030
2031                    directive = tok.value
2032                    lineno    = tok.lineno
2033
2034                    # skip spaces
2035                    tok = tokzer.getToken()
2036                    while tok.id == tokSPACE:
2037                        tok = tokzer.getToken()
2038
2039                    # then record tokens until LN
2040                    dirtokens = []
2041                    while tok.id != tokLN and tok.id != tokEOF:
2042                        dirtokens.append(tok)
2043                        tok = tokzer.getToken()
2044
2045                    block = Block(dirtokens,directive,lineno)
2046                    blocks.append(block)
2047                    state   = 1
2048
2049            else:
2050                state = 0
2051                current.append(tok)
2052
2053        if len(current) > 0:
2054            block = Block(current)
2055            blocks.append(block)
2056
2057        return BlockList(blocks)
2058
2059    def parse(self,tokzer):
2060        return self.getBlocks( tokzer )
2061
2062    def parseLines(self,lines):
2063        """parse a list of text lines into a BlockList object"""
2064        return self.getBlocks( CppLinesTokenizer(lines) )
2065
2066    def parseFile(self,path):
2067        """parse a file into a BlockList object"""
2068        file = open(path, "rt")
2069        result = self.getBlocks( CppFileTokenizer(file) )
2070        file.close()
2071        return result
2072
2073
2074def test_block_parsing(lines,expected):
2075    blocks = BlockParser().parse( CppLinesTokenizer(lines) )
2076    if len(blocks) != len(expected):
2077        raise BadExpectedToken, "parser.buildBlocks returned '%s' expecting '%s'" \
2078              % (str(blocks), repr(expected))
2079    for n in range(len(blocks)):
2080        if str(blocks[n]) != expected[n]:
2081            raise BadExpectedToken, "parser.buildBlocks()[%d] is '%s', expecting '%s'" \
2082                  % (n, str(blocks[n]), expected[n])
2083    #for block in blocks:
2084    #    print block
2085
2086def test_BlockParser():
2087    test_block_parsing(["#error hello"],["#error hello"])
2088    test_block_parsing([ "foo", "", "bar" ], [ "foo\n\nbar\n" ])
2089    test_block_parsing([ "foo", "  #  ", "bar" ], [ "foo\n","bar\n" ])
2090    test_block_parsing(\
2091        [ "foo", "   #  ", "  #  /* ahah */ if defined(__KERNEL__) ", "bar", "#endif" ],
2092        [ "foo\n", "#ifdef __KERNEL__", "bar\n", "#endif" ] )
2093
2094
2095#####################################################################################
2096#####################################################################################
2097#####                                                                           #####
2098#####        B L O C K   L I S T   O P T I M I Z A T I O N                      #####
2099#####                                                                           #####
2100#####################################################################################
2101#####################################################################################
2102
2103def  remove_macro_defines( blocks, excludedMacros=set() ):
2104    """remove macro definitions like #define <macroName>  ...."""
2105    result = []
2106    for b in blocks:
2107        macroName = b.isDefine()
2108        if macroName == None or not macroName in excludedMacros:
2109            result.append(b)
2110
2111    return result
2112
2113def  find_matching_endif( blocks, i ):
2114    n     = len(blocks)
2115    depth = 1
2116    while i < n:
2117        if blocks[i].isDirective():
2118            dir = blocks[i].directive
2119            if dir in [ "if", "ifndef", "ifdef" ]:
2120                depth += 1
2121            elif depth == 1 and dir in [ "else", "elif" ]:
2122                return i
2123            elif dir == "endif":
2124                depth -= 1
2125                if depth == 0:
2126                    return i
2127        i += 1
2128    return i
2129
2130def  optimize_if01( blocks ):
2131    """remove the code between #if 0 .. #endif in a list of CppBlocks"""
2132    i = 0
2133    n = len(blocks)
2134    result = []
2135    while i < n:
2136        j = i
2137        while j < n and not blocks[j].isIf():
2138            j += 1
2139        if j > i:
2140            D2("appending lines %d to %d" % (blocks[i].lineno, blocks[j-1].lineno))
2141            result += blocks[i:j]
2142        if j >= n:
2143            break
2144        expr = blocks[j].expr
2145        r    = expr.toInt()
2146        if r == None:
2147            result.append(blocks[j])
2148            i = j + 1
2149            continue
2150
2151        if r == 0:
2152            # if 0 => skip everything until the corresponding #endif
2153            j = find_matching_endif( blocks, j+1 )
2154            if j >= n:
2155                # unterminated #if 0, finish here
2156                break
2157            dir = blocks[j].directive
2158            if dir == "endif":
2159                D2("remove 'if 0' .. 'endif' (lines %d to %d)" % (blocks[i].lineno, blocks[j].lineno))
2160                i = j + 1
2161            elif dir == "else":
2162                # convert 'else' into 'if 1'
2163                D2("convert 'if 0' .. 'else' into 'if 1' (lines %d to %d)" % (blocks[i].lineno, blocks[j-1].lineno))
2164                blocks[j].directive = "if"
2165                blocks[j].expr      = CppExpr( CppLineTokenizer("1").toTokenList() )
2166                i = j
2167            elif dir == "elif":
2168                # convert 'elif' into 'if'
2169                D2("convert 'if 0' .. 'elif' into 'if'")
2170                blocks[j].directive = "if"
2171                i = j
2172            continue
2173
2174        # if 1 => find corresponding endif and remove/transform them
2175        k = find_matching_endif( blocks, j+1 )
2176        if k >= n:
2177            # unterminated #if 1, finish here
2178            D2("unterminated 'if 1'")
2179            result += blocks[j+1:k]
2180            break
2181
2182        dir = blocks[k].directive
2183        if dir == "endif":
2184            D2("convert 'if 1' .. 'endif' (lines %d to %d)"  % (blocks[j].lineno, blocks[k].lineno))
2185            result += optimize_if01(blocks[j+1:k])
2186            i       = k+1
2187        elif dir == "else":
2188            # convert 'else' into 'if 0'
2189            D2("convert 'if 1' .. 'else' (lines %d to %d)"  % (blocks[j].lineno, blocks[k].lineno))
2190            result += optimize_if01(blocks[j+1:k])
2191            blocks[k].directive = "if"
2192            blocks[k].expr      = CppExpr( CppLineTokenizer("0").toTokenList() )
2193            i = k
2194        elif dir == "elif":
2195            # convert 'elif' into 'if 0'
2196            D2("convert 'if 1' .. 'elif' (lines %d to %d)" % (blocks[j].lineno, blocks[k].lineno))
2197            result += optimize_if01(blocks[j+1:k])
2198            blocks[k].expr      = CppExpr( CppLineTokenizer("0").toTokenList() )
2199            i = k
2200    return result
2201
2202def  test_optimizeAll():
2203    text = """\
2204#if 1
2205#define  GOOD_1
2206#endif
2207#if 0
2208#define  BAD_2
2209#define  BAD_3
2210#endif
2211
2212#if 1
2213#define  GOOD_2
2214#else
2215#define  BAD_4
2216#endif
2217
2218#if 0
2219#define  BAD_5
2220#else
2221#define  GOOD_3
2222#endif
2223
2224#if 0
2225#if 1
2226#define  BAD_6
2227#endif
2228#endif\
2229"""
2230
2231    expected = """\
2232#define GOOD_1
2233
2234#define GOOD_2
2235
2236#define GOOD_3
2237
2238"""
2239
2240    print "running test_BlockList.optimizeAll"
2241    out = StringOutput()
2242    lines = string.split(text, '\n')
2243    list = BlockParser().parse( CppLinesTokenizer(lines) )
2244    #D_setlevel(2)
2245    list.optimizeAll( {"__KERNEL__":kCppUndefinedMacro} )
2246    #print repr(list)
2247    list.write(out)
2248    if out.get() != expected:
2249        print "KO: macro optimization failed\n"
2250        print "<<<< expecting '",
2251        print expected,
2252        print "'\n>>>> result '"
2253        print out.get(),
2254        print "'\n----"
2255
2256
2257#####################################################################################
2258#####################################################################################
2259#####                                                                           #####
2260#####                                                                           #####
2261#####                                                                           #####
2262#####################################################################################
2263#####################################################################################
2264
2265def runUnitTests():
2266    """run all unit tests for this program"""
2267    print "running unit tests"
2268    test_CppTokenizer()
2269    test_CppExpr()
2270    test_optimizeAll()
2271    test_BlockParser()
2272    print "OK"
2273
2274if __name__ == "__main__":
2275    runUnitTests()
2276