diff options
Diffstat (limited to 'libc/kernel/tools/cpp.py')
-rw-r--r-- | libc/kernel/tools/cpp.py | 2180 |
1 files changed, 2180 insertions, 0 deletions
diff --git a/libc/kernel/tools/cpp.py b/libc/kernel/tools/cpp.py new file mode 100644 index 0000000..4b4bd38 --- /dev/null +++ b/libc/kernel/tools/cpp.py @@ -0,0 +1,2180 @@ +# a glorified C pre-processor parser + +import sys, re, string +from utils import * +from defaults import * + +debugTokens = False +debugDirectiveTokenizer = False +debugLineParsing = False +debugCppExpr = False +debugOptimIf01 = False + +##################################################################################### +##################################################################################### +##### ##### +##### C P P T O K E N S ##### +##### ##### +##################################################################################### +##################################################################################### + +# the list of supported C-preprocessor tokens +# plus a couple of C tokens as well +tokEOF = "\0" +tokLN = "\n" +tokSTRINGIFY = "#" +tokCONCAT = "##" +tokLOGICAND = "&&" +tokLOGICOR = "||" +tokSHL = "<<" +tokSHR = ">>" +tokEQUAL = "==" +tokNEQUAL = "!=" +tokLT = "<" +tokLTE = "<=" +tokGT = ">" +tokGTE = ">=" +tokELLIPSIS = "..." +tokSPACE = " " +tokDEFINED = "defined" +tokLPAREN = "(" +tokRPAREN = ")" +tokNOT = "!" +tokPLUS = "+" +tokMINUS = "-" +tokMULTIPLY = "*" +tokDIVIDE = "/" +tokMODULUS = "%" +tokBINAND = "&" +tokBINOR = "|" +tokBINXOR = "^" +tokCOMMA = "," +tokLBRACE = "{" +tokRBRACE = "}" +tokARROW = "->" +tokINCREMENT = "++" +tokDECREMENT = "--" +tokNUMBER = "<number>" +tokIDENT = "<ident>" +tokSTRING = "<string>" + +class Token: + """a simple class to hold information about a given token. + each token has a position in the source code, as well as + an 'id' and a 'value'. the id is a string that identifies + the token's class, while the value is the string of the + original token itself. + + for example, the tokenizer concatenates a series of spaces + and tabs as a single tokSPACE id, whose value if the original + spaces+tabs sequence.""" + + def __init__(self): + self.id = None + self.value = None + self.lineno = 0 + self.colno = 0 + + def set(self,id,val=None): + self.id = id + if val: + self.value = val + else: + self.value = id + return None + + def copyFrom(self,src): + self.id = src.id + self.value = src.value + self.lineno = src.lineno + self.colno = src.colno + + def __repr__(self): + if self.id == tokIDENT: + return "(ident %s)" % self.value + if self.id == tokNUMBER: + return "(number %s)" % self.value + if self.id == tokSTRING: + return "(string '%s')" % self.value + if self.id == tokLN: + return "<LN>" + if self.id == tokEOF: + return "<EOF>" + if self.id == tokSPACE and self.value == "\\": + # this corresponds to a trailing \ that was transformed into a tokSPACE + return "<\\>" + + return self.id + + def __str__(self): + if self.id == tokIDENT: + return self.value + if self.id == tokNUMBER: + return self.value + if self.id == tokSTRING: + return self.value + if self.id == tokEOF: + return "<EOF>" + if self.id == tokSPACE: + if self.value == "\\": # trailing \ + return "\\\n" + else: + return self.value + + return self.id + +class BadExpectedToken(Exception): + def __init__(self,msg): + print msg + +##################################################################################### +##################################################################################### +##### ##### +##### C P P T O K E N C U R S O R ##### +##### ##### +##################################################################################### +##################################################################################### + +class TokenCursor: + """a small class to iterate over a list of Token objects""" + def __init__(self,tokens): + self.tokens = tokens + self.n = 0 + self.count = len(tokens) + + def set(self,n): + """set the current position""" + if n < 0: + n = 0 + if n > self.count: + n = self.count + self.n = n + + def peekId(self): + """retrieve the id of the current token""" + if (self.n >= self.count): + return None + return self.tokens[self.n].id + + def peek(self): + """retrieve the current token. does not change position""" + if (self.n >= self.count): + return None + return self.tokens[self.n] + + def skip(self): + """increase current token position""" + if (self.n < self.count): + self.n += 1 + + def skipSpaces(self): + """skip over all space tokens, this includes tokSPACE and tokLN""" + while 1: + tok = self.peekId() + if tok != tokSPACE and tok != tokLN: + break + self.skip() + + def skipIfId(self,id): + """skip an optional token""" + if self.peekId() == id: + self.skip() + + def expectId(self,id): + """raise an exception if the current token hasn't a given id. + otherwise skip over it""" + tok = self.peek() + if tok.id != id: + raise BadExpectedToken, "%d:%d: '%s' expected, received '%s'" % (tok.lineno, tok.colno, id, tok.id) + self.skip() + + def remain(self): + """return the list of remaining tokens""" + return self.tokens[self.n:] + + +##################################################################################### +##################################################################################### +##### ##### +##### C P P T O K E N I Z E R ##### +##### ##### +##################################################################################### +##################################################################################### + +# list of long symbols, i.e. those that take more than one characters +cppLongSymbols = [ tokCONCAT, tokLOGICAND, tokLOGICOR, tokSHL, tokSHR, tokELLIPSIS, tokEQUAL,\ + tokNEQUAL, tokLTE, tokGTE, tokARROW, tokINCREMENT, tokDECREMENT ] + +class CppTokenizer: + """an abstract class used to convert some input text into a list + of tokens. real implementations follow and differ in the format + of the input text only""" + + def __init__(self): + """initialize a new CppTokenizer object""" + self.eof = False # end of file reached ? + self.text = None # content of current line, with final \n stripped + self.line = 0 # number of current line + self.pos = 0 # current character position in current line + self.len = 0 # length of current line text + self.held = Token() + + def setLineText(self,line): + """set the content of the (next) current line. should be called + by fillLineText() in derived classes""" + self.text = line + self.len = len(line) + self.pos = 0 + + def fillLineText(self): + """refresh the content of 'line' with a new line of input""" + # to be overriden + self.eof = True + + def markPos(self,tok): + """mark the position of the current token in the source file""" + if self.eof or self.pos > self.len: + tok.lineno = self.line + 1 + tok.colno = 0 + else: + tok.lineno = self.line + tok.colno = self.pos + + def peekChar(self): + """return the current token under the cursor without moving it""" + if self.eof: + return tokEOF + + if self.pos > self.len: + self.pos = 0 + self.line += 1 + self.fillLineText() + if self.eof: + return tokEOF + + if self.pos == self.len: + return tokLN + else: + return self.text[self.pos] + + def peekNChar(self,n): + """try to peek the next n chars on the same line""" + if self.pos + n > self.len: + return None + return self.text[self.pos:self.pos+n] + + def skipChar(self): + """increment the token cursor position""" + if not self.eof: + self.pos += 1 + + def skipNChars(self,n): + if self.pos + n <= self.len: + self.pos += n + else: + while n > 0: + self.skipChar() + n -= 1 + + def nextChar(self): + """retrieve the token at the current cursor position, then skip it""" + result = self.peekChar() + self.skipChar() + return result + + def getEscape(self): + # try to get all characters after a backslash (\) + result = self.nextChar() + if result == "0": + # octal number ? + num = self.peekNChar(3) + if num != None: + isOctal = True + for d in num: + if not d in "01234567": + isOctal = False + break + if isOctal: + result += num + self.skipNChars(3) + elif result == "x" or result == "X": + # hex number ? + num = self.peekNChar(2) + if num != None: + isHex = True + for d in num: + if not d in "012345678abcdefABCDEF": + isHex = False + break + if isHex: + result += num + self.skipNChars(2) + elif result == "u" or result == "U": + # unicode char ? + num = self.peekNChar(4) + if num != None: + isHex = True + for d in num: + if not d in "012345678abcdefABCDEF": + isHex = False + break + if isHex: + result += num + self.skipNChars(4) + + return result + + def nextRealToken(self,tok): + """return next CPP token, used internally by nextToken()""" + c = self.nextChar() + if c == tokEOF or c == tokLN: + return tok.set(c) + + if c == '/': + c = self.peekChar() + if c == '/': # C++ comment line + self.skipChar() + while 1: + c = self.nextChar() + if c == tokEOF or c == tokLN: + break + return tok.set(tokLN) + if c == '*': # C comment start + self.skipChar() + value = "/*" + prev_c = None + while 1: + c = self.nextChar() + if c == tokEOF: + #print "## EOF after '%s'" % value + return tok.set(tokEOF,value) + if c == '/' and prev_c == '*': + break + prev_c = c + value += c + + value += "/" + #print "## COMMENT: '%s'" % value + return tok.set(tokSPACE,value) + c = '/' + + if c.isspace(): + while 1: + c2 = self.peekChar() + if c2 == tokLN or not c2.isspace(): + break + c += c2 + self.skipChar() + return tok.set(tokSPACE,c) + + if c == '\\': + if debugTokens: + print "nextRealToken: \\ found, next token is '%s'" % repr(self.peekChar()) + if self.peekChar() == tokLN: # trailing \ + # eat the tokLN + self.skipChar() + # we replace a trailing \ by a tokSPACE whose value is + # simply "\\". this allows us to detect them later when + # needed. + return tok.set(tokSPACE,"\\") + else: + # treat as a single token here ? + c +=self.getEscape() + return tok.set(c) + + if c == "'": # chars + c2 = self.nextChar() + c += c2 + if c2 == '\\': + c += self.getEscape() + + while 1: + c2 = self.nextChar() + if c2 == tokEOF: + break + c += c2 + if c2 == "'": + break + + return tok.set(tokSTRING, c) + + if c == '"': # strings + quote = 0 + while 1: + c2 = self.nextChar() + if c2 == tokEOF: + return tok.set(tokSTRING,c) + + c += c2 + if not quote: + if c2 == '"': + return tok.set(tokSTRING,c) + if c2 == "\\": + quote = 1 + else: + quote = 0 + + if c >= "0" and c <= "9": # integers ? + while 1: + c2 = self.peekChar() + if c2 == tokLN or (not c2.isalnum() and c2 != "_"): + break + c += c2 + self.skipChar() + return tok.set(tokNUMBER,c) + + if c.isalnum() or c == "_": # identifiers ? + while 1: + c2 = self.peekChar() + if c2 == tokLN or (not c2.isalnum() and c2 != "_"): + break + c += c2 + self.skipChar() + if c == tokDEFINED: + return tok.set(tokDEFINED) + else: + return tok.set(tokIDENT,c) + + # check special symbols + for sk in cppLongSymbols: + if c == sk[0]: + sklen = len(sk[1:]) + if self.pos + sklen <= self.len and \ + self.text[self.pos:self.pos+sklen] == sk[1:]: + self.pos += sklen + return tok.set(sk) + + return tok.set(c) + + def nextToken(self,tok): + """return the next token from the input text. this function + really updates 'tok', and does not return a new one""" + self.markPos(tok) + self.nextRealToken(tok) + + def getToken(self): + tok = Token() + self.nextToken(tok) + if debugTokens: + print "getTokens: %s" % repr(tok) + return tok + + def toTokenList(self): + """convert the input text of a CppTokenizer into a direct + list of token objects. tokEOF is stripped from the result""" + result = [] + while 1: + tok = Token() + self.nextToken(tok) + if tok.id == tokEOF: + break + result.append(tok) + return result + +class CppLineTokenizer(CppTokenizer): + """a CppTokenizer derived class that accepts a single line of text as input""" + def __init__(self,line,lineno=1): + CppTokenizer.__init__(self) + self.line = lineno + self.setLineText(line) + + +class CppLinesTokenizer(CppTokenizer): + """a CppTokenizer derived class that accepts a list of texdt lines as input. + the lines must not have a trailing \n""" + def __init__(self,lines=[],lineno=1): + """initialize a CppLinesTokenizer. you can later add lines using addLines()""" + CppTokenizer.__init__(self) + self.line = lineno + self.lines = lines + self.index = 0 + self.count = len(lines) + + if self.count > 0: + self.fillLineText() + else: + self.eof = True + + def addLine(self,line): + """add a line to a CppLinesTokenizer. this can be done after tokenization + happens""" + if self.count == 0: + self.setLineText(line) + self.index = 1 + self.lines.append(line) + self.count += 1 + self.eof = False + + def fillLineText(self): + if self.index < self.count: + self.setLineText(self.lines[self.index]) + self.index += 1 + else: + self.eof = True + + +class CppFileTokenizer(CppTokenizer): + def __init__(self,file,lineno=1): + CppTokenizer.__init__(self) + self.file = file + self.line = lineno + + def fillLineText(self): + line = self.file.readline() + if len(line) > 0: + if line[-1] == '\n': + line = line[:-1] + if len(line) > 0 and line[-1] == "\r": + line = line[:-1] + self.setLineText(line) + else: + self.eof = True + +# Unit testing +# +class CppTokenizerTester: + """a class used to test CppTokenizer classes""" + def __init__(self,tokenizer=None): + self.tokenizer = tokenizer + self.token = Token() + + def setTokenizer(self,tokenizer): + self.tokenizer = tokenizer + + def expect(self,id): + self.tokenizer.nextToken(self.token) + tokid = self.token.id + if tokid == id: + return + if self.token.value == id and (tokid == tokIDENT or tokid == tokNUMBER): + return + raise BadExpectedToken, "### BAD TOKEN: '%s' expecting '%s'" % (self.token.id,id) + + def expectToken(self,id,line,col): + self.expect(id) + if self.token.lineno != line: + raise BadExpectedToken, "### BAD LINENO: token '%s' got '%d' expecting '%d'" % (id,self.token.lineno,line) + if self.token.colno != col: + raise BadExpectedToken, "### BAD COLNO: '%d' expecting '%d'" % (self.token.colno,col) + + def expectTokenVal(self,id,value,line,col): + self.expectToken(id,line,col) + if self.token.value != value: + raise BadExpectedToken, "### BAD VALUE: '%s' expecting '%s'" % (self.token.value,value) + + def expectList(self,list): + for item in list: + self.expect(item) + +def test_CppTokenizer(): + print "running CppTokenizer tests" + tester = CppTokenizerTester() + + tester.setTokenizer( CppLineTokenizer("#an/example && (01923_xy)") ) + tester.expectList( ["#", "an", "/", "example", tokSPACE, tokLOGICAND, tokSPACE, tokLPAREN, "01923_xy", \ + tokRPAREN, tokLN, tokEOF] ) + + tester.setTokenizer( CppLineTokenizer("FOO(BAR) && defined(BAZ)") ) + tester.expectList( ["FOO", tokLPAREN, "BAR", tokRPAREN, tokSPACE, tokLOGICAND, tokSPACE, + tokDEFINED, tokLPAREN, "BAZ", tokRPAREN, tokLN, tokEOF] ) + + tester.setTokenizer( CppLinesTokenizer( ["/*", "#", "*/"] ) ) + tester.expectList( [ tokSPACE, tokLN, tokEOF ] ) + + tester.setTokenizer( CppLinesTokenizer( ["first", "second"] ) ) + tester.expectList( [ "first", tokLN, "second", tokLN, tokEOF ] ) + + tester.setTokenizer( CppLinesTokenizer( ["first second", " third"] ) ) + tester.expectToken( "first", 1, 0 ) + tester.expectToken( tokSPACE, 1, 5 ) + tester.expectToken( "second", 1, 6 ) + tester.expectToken( tokLN, 1, 12 ) + tester.expectToken( tokSPACE, 2, 0 ) + tester.expectToken( "third", 2, 2 ) + + tester.setTokenizer( CppLinesTokenizer( [ "boo /* what the", "hell */" ] ) ) + tester.expectList( [ "boo", tokSPACE ] ) + tester.expectTokenVal( tokSPACE, "/* what the\nhell */", 1, 4 ) + tester.expectList( [ tokLN, tokEOF ] ) + + tester.setTokenizer( CppLinesTokenizer( [ "an \\", " example" ] ) ) + tester.expectToken( "an", 1, 0 ) + tester.expectToken( tokSPACE, 1, 2 ) + tester.expectTokenVal( tokSPACE, "\\", 1, 3 ) + tester.expectToken( tokSPACE, 2, 0 ) + tester.expectToken( "example", 2, 1 ) + tester.expectToken( tokLN, 2, 8 ) + + return True + + +##################################################################################### +##################################################################################### +##### ##### +##### C P P E X P R E S S I O N S ##### +##### ##### +##################################################################################### +##################################################################################### + +# Cpp expressions are modeled by tuples of the form (op,arg) or (op,arg1,arg2), etc.. +# op is an "operator" string + +class Expr: + """a class used to model a CPP expression""" + opInteger = "int" + opIdent = "ident" + opCall = "call" + opDefined = "defined" + opTest = "?" + opLogicNot = "!" + opNot = "~" + opNeg = "[-]" + opUnaryPlus = "[+]" + opAdd = "+" + opSub = "-" + opMul = "*" + opDiv = "/" + opMod = "%" + opAnd = "&" + opOr = "|" + opXor = "^" + opLogicAnd = "&&" + opLogicOr = "||" + opEqual = "==" + opNotEqual = "!=" + opLess = "<" + opLessEq = "<=" + opGreater = ">" + opGreaterEq = ">=" + opShl = "<<" + opShr = ">>" + + unaries = [ opLogicNot, opNot, opNeg, opUnaryPlus ] + binaries = [ opAdd, opSub, opMul, opDiv, opMod, opAnd, opOr, opXor, opLogicAnd, opLogicOr, + opEqual, opNotEqual, opLess, opLessEq, opGreater, opGreaterEq ] + + precedences = { + opTest: 0, + opLogicOr: 1, + opLogicNot: 2, + opOr : 3, + opXor: 4, + opAnd: 5, + opEqual: 6, opNotEqual: 6, + opLess:7, opLessEq:7, opGreater:7, opGreaterEq:7, + opShl:8, opShr:8, + opAdd:9, opSub:9, + opMul:10, opDiv:10, opMod:10, + opLogicNot:11, + opNot: 12, + } + + def __init__(self,op): + self.op = op + + def __repr__(self): + return "(%s)" % self.op + + def __str__(self): + return "operator(%s)" % self.op + + def precedence(self): + """return the precedence of a given operator""" + return Expr.precedences.get(self.op, 1000) + + def isUnary(self): + return self.op in Expr.unaries + + def isBinary(self): + return self.op in Expr.binaries + + def isDefined(self): + return self.op is opDefined + + def toInt(self): + """return the integer value of a given expression. only valid for integer expressions + will return None otherwise""" + return None + +class IntExpr(Expr): + def __init__(self,value): + Expr.__init__(self,opInteger) + self.arg = value + + def __repr__(self): + return "(int %s)" % self.arg + + def __str__(self): + return self.arg + + def toInt(self): + s = self.arg # string value + # get rid of U or L suffixes + while len(s) > 0 and s[-1] in "LUlu": + s = s[:-1] + return string.atoi(s) + +class IdentExpr(Expr): + def __init__(self,name): + Expr.__init__(self,opIdent) + self.name = name + + def __repr__(self): + return "(ident %s)" % self.name + + def __str__(self): + return self.name + +class CallExpr(Expr): + def __init__(self,funcname,params): + Expr.__init__(self,opCall) + self.funcname = funcname + self.params = params + + def __repr__(self): + result = "(call %s [" % self.funcname + comma = "" + for param in self.params: + result += "%s%s" % (comma, repr(param)) + comma = "," + result += "])" + return result + + def __str__(self): + result = "%s(" % self.funcname + comma = "" + for param in self.params: + result += "%s%s" % (comma, str(param)) + comma = "," + + result += ")" + return result + +class TestExpr(Expr): + def __init__(self,cond,iftrue,iffalse): + Expr.__init__(self,opTest) + self.cond = cond + self.iftrue = iftrue + self.iffalse = iffalse + + def __repr__(self): + return "(?: %s %s %s)" % (repr(self.cond),repr(self.iftrue),repr(self.iffalse)) + + def __str__(self): + return "(%s) ? (%s) : (%s)" % (self.cond, self.iftrue, self.iffalse) + +class SingleArgExpr(Expr): + def __init__(self,op,arg): + Expr.__init__(self,op) + self.arg = arg + + def __repr__(self): + return "(%s %s)" % (self.op, repr(self.arg)) + +class DefinedExpr(SingleArgExpr): + def __init__(self,op,macroname): + SingleArgExpr.__init__(self.opDefined,macroname) + + def __str__(self): + return "defined(%s)" % self.arg + + +class UnaryExpr(SingleArgExpr): + def __init__(self,op,arg,opstr=None): + SingleArgExpr.__init__(self,op,arg) + if not opstr: + opstr = op + self.opstr = opstr + + def __str__(self): + arg_s = str(self.arg) + arg_prec = self.arg.precedence() + self_prec = self.precedence() + if arg_prec < self_prec: + return "%s(%s)" % (self.opstr,arg_s) + else: + return "%s%s" % (self.opstr, arg_s) + +class TwoArgExpr(Expr): + def __init__(self,op,arg1,arg2): + Expr.__init__(self,op) + self.arg1 = arg1 + self.arg2 = arg2 + + def __repr__(self): + return "(%s %s %s)" % (self.op, repr(self.arg1), repr(self.arg2)) + +class BinaryExpr(TwoArgExpr): + def __init__(self,op,arg1,arg2,opstr=None): + TwoArgExpr.__init__(self,op,arg1,arg2) + if not opstr: + opstr = op + self.opstr = opstr + + def __str__(self): + arg1_s = str(self.arg1) + arg2_s = str(self.arg2) + arg1_prec = self.arg1.precedence() + arg2_prec = self.arg2.precedence() + self_prec = self.precedence() + + result = "" + if arg1_prec < self_prec: + result += "(%s)" % arg1_s + else: + result += arg1_s + + result += " %s " % self.opstr + + if arg2_prec < self_prec: + result += "(%s)" % arg2_s + else: + result += arg2_s + + return result + +##################################################################################### +##################################################################################### +##### ##### +##### C P P E X P R E S S I O N P A R S E R ##### +##### ##### +##################################################################################### +##################################################################################### + + +class ExprParser: + """a class used to convert a list of tokens into a cpp Expr object""" + + re_octal = re.compile(r"\s*\(0[0-7]+\).*") + re_decimal = re.compile(r"\s*\(\d+[ulUL]*\).*") + re_hexadecimal = re.compile(r"\s*\(0[xX][0-9a-fA-F]*\).*") + + def __init__(self,tokens): + self.tok = tokens + self.n = len(self.tok) + self.i = 0 + + def mark(self): + return self.i + + def release(self,pos): + self.i = pos + + def peekId(self): + if self.i < self.n: + return self.tok[self.i].id + return None + + def peek(self): + if self.i < self.n: + return self.tok[self.i] + return None + + def skip(self): + if self.i < self.n: + self.i += 1 + + def skipOptional(self,id): + if self.i < self.n and self.tok[self.i].id == id: + self.i += 1 + + def skipSpaces(self): + i = self.i + n = self.n + tok = self.tok + while i < n and (tok[i] == tokSPACE or tok[i] == tokLN): + i += 1 + self.i = i + + # all the isXXX functions returns a (expr,nextpos) pair if a match is found + # or None if not + + def is_integer(self): + id = self.tok[self.i].id + c = id[0] + if c < '0' or c > '9': + return None + + m = ExprParser.re_octal.match(id) + if m: + return (IntExpr(id), m.end(1)) + + m = ExprParser.re_decimal.match(id) + if m: + return (IntExpr(id), m.end(1)) + + m = ExprParser.re_hexadecimal(id) + if m: + return (IntExpr(id), m.end(1)) + + return None + + def is_defined(self): + id = self.tok[self.i].id + if id != "defined": + return None + + pos = self.mark() + + use_paren = 0 + if self.peekId() == tokLPAREN: + self.skip() + use_paren = 1 + + if self.peekId() != tokIDENT: + self.throw( BadExpectedToken, "identifier expected") + + macroname = self.peek().value + self.skip() + if use_paren: + self.skipSpaces() + if self.peekId() != tokRPAREN: + self.throw( BadExpectedToken, "missing right-paren after 'defined' directive") + self.skip() + + i = self.i + return (DefinedExpr(macroname),i+1) + + def is_call_or_ident(self): + pass + + def parse(self, i): + return None + +##################################################################################### +##################################################################################### +##### ##### +##### C P P E X P R E S S I O N S ##### +##### ##### +##################################################################################### +##################################################################################### + +class CppInvalidExpression(Exception): + """an exception raised when an invalid/unsupported cpp expression is detected""" + pass + +class CppExpr: + """a class that models the condition of #if directives into + an expression tree. each node in the tree is of the form (op,arg) or (op,arg1,arg2) + where "op" is a string describing the operation""" + + unaries = [ "!", "~" ] + binaries = [ "+", "-", "<", "<=", ">=", ">", "&&", "||", "*", "/", "%", "&", "|", "^", "<<", ">>", "==", "!=" ] + precedences = { "||": 1, + "&&": 2, + "|": 3, + "^": 4, + "&": 5, + "==":6, "!=":6, + "<":7, "<=":7, ">":7, ">=":7, + "<<":8, ">>":8, + "+":9, "-":9, + "*":10, "/":10, "%":10, + "!":11, "~":12 + } + + def __init__(self, tokens): + """initialize a CppExpr. 'tokens' must be a CppToken list""" + self.tok = tokens + self.n = len(tokens) + if debugCppExpr: + print "CppExpr: trying to parse %s" % repr(tokens) + expr = self.is_expr(0) + if debugCppExpr: + print "CppExpr: got " + repr(expr) + self.expr = expr[0] + + re_cpp_constant = re.compile(r"((\d|\w|_)+)") + + def throw(self,exception,i,msg): + if i < self.n: + tok = self.tok[i] + print "%d:%d: %s" % (tok.lineno,tok.colno,msg) + else: + print "EOF: %s" % msg + raise exception + + def skip_spaces(self,i): + """skip spaces in input token list""" + while i < self.n: + t = self.tok[i] + if t.id != tokSPACE and t.id != tokLN: + break + i += 1 + return i + + def expectId(self,i,id): + """check that a given token id is at the current position, then skip over it""" + i = self.skip_spaces(i) + if i >= self.n or self.tok[i].id != id: + self.throw(BadExpectedToken,i,"### expecting '%s' in expression, got '%s'" % (id, self.tok[i].id)) + return i+1 + + def expectIdent(self,i): + i = self.skip_spaces(i) + if i >= self.n or self.tok[i].id != tokIDENT: + self.throw(BadExpectedToken,i,"### expecting identifier in expression, got '%s'" % (id, self.tok[i].id)) + return i+1 + + # the is_xxxxx function returns either None or a pair (e,nextpos) + # where 'e' is an expression tuple (e.g. (op,arg)) and 'nextpos' is + # the corresponding next position in the input token list + # + + def is_decimal(self,i): + v = self.tok[i].value[:] + while len(v) > 0 and v[-1] in "ULul": + v = v[:-1] + for digit in v: + if not digit.isdigit(): + return None + + # for an integer expression tuple, the argument + # is simply the value as an integer + val = string.atoi(v) + return ("int", val), i+1 + + def is_hexadecimal(self,i): + v = self.tok[i].value[:] + while len(v) > 0 and v[-1] in "ULul": + v = v[:-1] + if len(v) > 2 and (v[0:2] == "0x" or v[0:2] == "0X"): + for digit in v[2:]: + if not digit in "0123456789abcdefABCDEF": + return None + + # for an hex expression tuple, the argument + # is the value as an integer + val = int(v[2:], 16) + return ("hex", val), i+1 + + return None + + def is_integer(self,i): + if self.tok[i].id != tokNUMBER: + return None + + c = self.is_decimal(i) + if c: return c + + c = self.is_hexadecimal(i) + if c: return c + + return None + + def is_number(self,i): + t = self.tok[i] + if t.id == tokMINUS and i+1 < self.n: + c = self.is_integer(i+1) + if c: + e, i2 = c + op, val = e + return (op, -val), i2 + if t.id == tokPLUS and i+1 < self.n: + c = self.is_integer(i+1) + if c: return c + + return self.is_integer(i) + + + def is_alnum(self,i): + """test wether a given token is alpha-numeric""" + i = self.skip_spaces(i) + if i >= self.n: + return None + t = self.tok[i] + m = CppExpr.re_cpp_constant.match(t.id) + if m: + #print "... alnum '%s'" % m.group(1) + r = m.group(1) + return ("ident", r), i+1 + return None + + def is_defined(self,i): + t = self.tok[i] + if t.id != tokDEFINED: + return None + + # we have the defined keyword, check the rest + i = self.skip_spaces(i+1) + use_parens = 0 + if i < self.n and self.tok[i].id == tokLPAREN: + use_parens = 1 + i = self.skip_spaces(i+1) + + if i >= self.n: + self.throw(CppConstantExpected,i,"### 'defined' must be followed by macro name or left paren") + + t = self.tok[i] + if t.id != tokIDENT: + self.throw(CppConstantExpected,i,"### 'defined' must be followed by macro name") + + i += 1 + if use_parens: + i = self.expectId(i,tokRPAREN) + + return ("defined",t.value), i + + + def is_call_or_ident(self,i): + i = self.skip_spaces(i) + if i >= self.n: + return None + + t = self.tok[i] + if t.id != tokIDENT: + return None + + name = t.value + + i = self.skip_spaces(i+1) + if i >= self.n or self.tok[i].id != tokLPAREN: + return ("ident", name), i + + params = [] + depth = 1 + i += 1 + j = i + while i < self.n: + id = self.tok[i].id + if id == tokLPAREN: + depth += 1 + elif depth == 1 and (id == tokCOMMA or id == tokRPAREN): + while j < i and self.tok[j].id == tokSPACE: + j += 1 + k = i + while k > j and self.tok[k-1].id == tokSPACE: + k -= 1 + param = self.tok[j:k] + params.append( param ) + if id == tokRPAREN: + break + j = i+1 + elif id == tokRPAREN: + depth -= 1 + i += 1 + + if i >= self.n: + return None + + return ("call", (name, params)), i+1 + + def is_token(self,i,token): + i = self.skip_spaces(i) + if i >= self.n or self.tok[i].id != token: + return None + return token, i+1 + + + def is_value(self,i): + t = self.tok[i] + if t.id == tokSTRING: + return ("string", t.value), i+1 + + c = self.is_number(i) + if c: return c + + c = self.is_defined(i) + if c: return c + + c = self.is_call_or_ident(i) + if c: return c + + i = self.skip_spaces(i) + if i >= self.n or self.tok[i].id != tokLPAREN: + return None + + popcount = 1 + i2 = i+1 + while i2 < self.n: + t = self.tok[i2] + if t.id == tokLPAREN: + popcount += 1 + elif t.id == tokRPAREN: + popcount -= 1 + if popcount == 0: + break + i2 += 1 + + if popcount != 0: + self.throw(CppInvalidExpression, i, "expression missing closing parenthesis") + + if debugCppExpr: + print "CppExpr: trying to parse sub-expression %s" % repr(self.tok[i+1:i2]) + oldcount = self.n + self.n = i2 + c = self.is_expr(i+1) + self.n = oldcount + if not c: + self.throw(CppInvalidExpression, i, "invalid expression within parenthesis") + + e, i = c + return e, i2+1 + + def is_unary(self,i): + i = self.skip_spaces(i) + if i >= self.n: + return None + + t = self.tok[i] + if t.id in CppExpr.unaries: + c = self.is_unary(i+1) + if not c: + self.throw(CppInvalidExpression, i, "%s operator must be followed by value" % t.id) + e, i = c + return (t.id, e), i + + return self.is_value(i) + + def is_binary(self,i): + i = self.skip_spaces(i) + if i >= self.n: + return None + + c = self.is_unary(i) + if not c: + return None + + e1, i2 = c + i2 = self.skip_spaces(i2) + if i2 >= self.n: + return c + + t = self.tok[i2] + if t.id in CppExpr.binaries: + c = self.is_binary(i2+1) + if not c: + self.throw(CppInvalidExpression, i,"### %s operator must be followed by value" % t.id ) + e2, i3 = c + return (t.id, e1, e2), i3 + + return None + + def is_expr(self,i): + return self.is_binary(i) + + def dump_node(self,e): + op = e[0] + line = "(" + op + if op == "int": + line += " %d)" % e[1] + elif op == "hex": + line += " 0x%x)" % e[1] + elif op == "ident": + line += " %s)" % e[1] + elif op == "defined": + line += " %s)" % e[1] + elif op == "call": + arg = e[1] + line += " %s [" % arg[0] + prefix = "" + for param in arg[1]: + par = "" + for tok in param: + par += str(tok) + line += "%s%s" % (prefix, par) + prefix = "," + line += "])" + elif op in CppExpr.unaries: + line += " %s)" % self.dump_node(e[1]) + elif op in CppExpr.binaries: + line += " %s %s)" % (self.dump_node(e[1]), self.dump_node(e[2])) + else: + line += " ?%s)" % repr(e[1]) + + return line + + def __repr__(self): + return self.dump_node(self.expr) + + def source_node(self,e): + op = e[0] + if op == "int": + return "%d" % e[1] + if op == "hex": + return "0x%x" % e[1] + if op == "ident": + # XXX: should try to expand + return e[1] + if op == "defined": + return "defined(%s)" % e[1] + + prec = CppExpr.precedences.get(op,1000) + arg = e[1] + if op in CppExpr.unaries: + arg_src = self.source_node(arg) + arg_op = arg[0] + arg_prec = CppExpr.precedences.get(arg[0],1000) + if arg_prec < prec: + return "!(" + arg_src + ")" + else: + return "!" + arg_src + if op in CppExpr.binaries: + arg2 = e[2] + arg1_op = arg[0] + arg2_op = arg2[0] + arg1_src = self.source_node(arg) + arg2_src = self.source_node(arg2) + if CppExpr.precedences.get(arg1_op,1000) < prec: + arg1_src = "(%s)" % arg1_src + if CppExpr.precedences.get(arg2_op,1000) < prec: + arg2_src = "(%s)" % arg2_src + + return "%s %s %s" % (arg1_src, op, arg2_src) + return "???" + + def __str__(self): + return self.source_node(self.expr) + + def int_node(self,e): + if e[0] == "int": + return e[1] + elif e[1] == "hex": + return int(e[1],16) + else: + return None + + def toInt(self): + return self.int_node(self.expr) + + def optimize_node(self,e,macros={}): + op = e[0] + if op == "defined": + name = e[1] + if macros.has_key(name): + if macros[name] == kCppUndefinedMacro: + return ("int", 0) + else: + return ("int", 1) + + if kernel_remove_config_macros and name.startswith("CONFIG_"): + return ("int", 0) + + elif op == "!": + op, v = e + v = self.optimize_node(v, macros) + if v[0] == "int": + if v[1] == 0: + return ("int", 1) + else: + return ("int", 0) + + elif op == "&&": + op, l, r = e + l = self.optimize_node(l, macros) + r = self.optimize_node(r, macros) + li = self.int_node(l) + ri = self.int_node(r) + if li != None: + if li == 0: + return ("int", 0) + else: + return r + + elif op == "||": + op, l, r = e + l = self.optimize_node(l, macros) + r = self.optimize_node(r, macros) + li = self.int_node(l) + ri = self.int_node(r) + if li != None: + if li == 0: + return r + else: + return ("int", 1) + elif ri != None: + if ri == 0: + return l + else: + return ("int", 1) + return e + + def optimize(self,macros={}): + self.expr = self.optimize_node(self.expr,macros) + + def removePrefixedNode(self,e,prefix,names): + op = e[0] + if op == "defined": + name = e[1] + if name.startswith(prefix): + if names.has_key[name] and names[name] == "y": + return ("int", 1) + else: + return ("int", 0) + + elif op in CppExpr.unaries: + op, v = e + v = self.removePrefixedNode(v,prefix,names) + return (op, v) + elif op in CppExpr.binaries: + op, v1, v2 = e + v1 = self.removePrefixedNode(v1,prefix,names) + v2 = self.removePrefixedNode(v2,prefix,names) + return (op, v1, v2) + elif op == "call": + func, params = e[1] + params2 = [] + for param in params: + params2.append( self.removePrefixedNode(param,prefix,names) ) + return (op, (func, params2)) + + return e + + def removePrefixed(self,prefix,names={}): + self.expr = self.removePrefixedNode(self.expr,prefix,names) + + def is_equal_node(self,e1,e2): + if e1[0] != e2[0] or len(e1) != len(e2): + return False + + op = e1[0] + if op == "int" or op == "hex" or op == "!" or op == "defined": + return e1[0] == e2[0] + + return self.is_equal_node(e1[1],e2[1]) and self.is_equal_node(e1[2],e2[2]) + + def is_equal(self,other): + return self.is_equal_node(self.expr,other.expr) + +def test_cpp_expr(expr, expected): + e = CppExpr( CppLineTokenizer( expr ).toTokenList() ) + #print repr(e.expr) + s1 = repr(e) + if s1 != expected: + print "KO: expression '%s' generates '%s', should be '%s'" % (expr, s1, expected) + else: + #print "OK: expression '%s'" % expr + pass + +def test_cpp_expr_optim(expr, expected, macros={}): + e = CppExpr( CppLineTokenizer( expr ).toTokenList() ) + e.optimize(macros) + + s1 = repr(e) + if s1 != expected: + print "KO: optimized expression '%s' generates '%s', should be '%s'" % (expr, s1, expected) + else: + #print "OK: optmized expression '%s'" % expr + pass + +def test_cpp_expr_source(expr, expected): + e = CppExpr( CppLineTokenizer( expr ).toTokenList() ) + s1 = str(e) + if s1 != expected: + print "KO: source expression '%s' generates '%s', should be '%s'" % (expr, s1, expected) + else: + #print "OK: source expression '%s'" % expr + pass + +def test_CppExpr(): + print "testing CppExpr" + test_cpp_expr( "0", "(int 0)" ) + test_cpp_expr( "1", "(int 1)" ) + test_cpp_expr( "1 && 1", "(&& (int 1) (int 1))" ) + test_cpp_expr( "1 && 0", "(&& (int 1) (int 0))" ) + test_cpp_expr( "EXAMPLE", "(ident EXAMPLE)" ) + test_cpp_expr( "EXAMPLE - 3", "(- (ident EXAMPLE) (int 3))" ) + test_cpp_expr( "defined(EXAMPLE)", "(defined EXAMPLE)" ) + test_cpp_expr( "!defined(EXAMPLE)", "(! (defined EXAMPLE))" ) + test_cpp_expr( "defined(ABC) || defined(BINGO)", "(|| (defined ABC) (defined BINGO))" ) + test_cpp_expr( "FOO(BAR)", "(call FOO [BAR])" ) + + test_cpp_expr_optim( "0", "(int 0)" ) + test_cpp_expr_optim( "1", "(int 1)" ) + test_cpp_expr_optim( "1 && 1", "(int 1)" ) + test_cpp_expr_optim( "1 && 0", "(int 0)" ) + test_cpp_expr_optim( "0 && 1", "(int 0)" ) + test_cpp_expr_optim( "0 && 0", "(int 0)" ) + test_cpp_expr_optim( "1 || 1", "(int 1)" ) + test_cpp_expr_optim( "1 || 0", "(int 1)" ) + test_cpp_expr_optim( "0 || 1", "(int 1)" ) + test_cpp_expr_optim( "0 || 0", "(int 0)" ) + test_cpp_expr_optim( "EXAMPLE", "(ident EXAMPLE)" ) + test_cpp_expr_optim( "EXAMPLE - 3", "(- (ident EXAMPLE) (int 3))" ) + test_cpp_expr_optim( "defined(EXAMPLE)", "(defined EXAMPLE)" ) + test_cpp_expr_optim( "defined(EXAMPLE)", "(int 1)", { "EXAMPLE": "XOWOE" } ) + test_cpp_expr_optim( "defined(EXAMPLE)", "(int 0)", { "EXAMPLE": kCppUndefinedMacro} ) + test_cpp_expr_optim( "!defined(EXAMPLE)", "(! (defined EXAMPLE))" ) + test_cpp_expr_optim( "!defined(EXAMPLE)", "(int 0)", { "EXAMPLE" : "XOWOE" } ) + test_cpp_expr_optim( "!defined(EXAMPLE)", "(int 1)", { "EXAMPLE" : kCppUndefinedMacro } ) + test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(|| (defined ABC) (defined BINGO))" ) + test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 1)", { "ABC" : "1" } ) + test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 1)", { "BINGO" : "1" } ) + test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(defined ABC)", { "BINGO" : kCppUndefinedMacro } ) + test_cpp_expr_optim( "defined(ABC) || defined(BINGO)", "(int 0)", { "ABC" : kCppUndefinedMacro, "BINGO" : kCppUndefinedMacro } ) + + test_cpp_expr_source( "0", "0" ) + test_cpp_expr_source( "1", "1" ) + test_cpp_expr_source( "1 && 1", "1 && 1" ) + test_cpp_expr_source( "1 && 0", "1 && 0" ) + test_cpp_expr_source( "0 && 1", "0 && 1" ) + test_cpp_expr_source( "0 && 0", "0 && 0" ) + test_cpp_expr_source( "1 || 1", "1 || 1" ) + test_cpp_expr_source( "1 || 0", "1 || 0" ) + test_cpp_expr_source( "0 || 1", "0 || 1" ) + test_cpp_expr_source( "0 || 0", "0 || 0" ) + test_cpp_expr_source( "EXAMPLE", "EXAMPLE" ) + test_cpp_expr_source( "EXAMPLE - 3", "EXAMPLE - 3" ) + test_cpp_expr_source( "defined(EXAMPLE)", "defined(EXAMPLE)" ) + test_cpp_expr_source( "defined EXAMPLE", "defined(EXAMPLE)" ) + + +##################################################################################### +##################################################################################### +##### ##### +##### C P P B L O C K ##### +##### ##### +##################################################################################### +##################################################################################### + +class Block: + """a class used to model a block of input source text. there are two block types: + - direcive blocks: contain the tokens of a single pre-processor directive (e.g. #if) + - text blocks, contain the tokens of non-directive blocks + + the cpp parser class below will transform an input source file into a list of Block + objects (grouped in a BlockList object for convenience)""" + + def __init__(self,tokens,directive=None,lineno=0): + """initialize a new block, if 'directive' is None, this is a text block + NOTE: this automatically converts '#ifdef MACRO' into '#if defined(MACRO)' + and '#ifndef MACRO' into '#if !defined(MACRO)'""" + if directive == "ifdef": + tok = Token() + tok.set(tokDEFINED) + tokens = [ tok ] + tokens + directive = "if" + + elif directive == "ifndef": + tok1 = Token() + tok2 = Token() + tok1.set(tokNOT) + tok2.set(tokDEFINED) + tokens = [ tok1, tok2 ] + tokens + directive = "if" + + self.tokens = tokens + self.directive = directive + if lineno > 0: + self.lineno = lineno + else: + self.lineno = self.tokens[0].lineno + + if self.isIf(): + self.expr = CppExpr( self.tokens ) + + def isDirective(self): + """returns True iff this is a directive block""" + return self.directive != None + + def isConditional(self): + """returns True iff this is a conditional directive block""" + return self.directive in ["if","ifdef","ifndef","else","elif","endif"] + + def isDefine(self): + """returns the macro name in a #define directive, or None otherwise""" + if self.directive != "define": + return None + + return self.tokens[0].value + + def isIf(self): + """returns True iff this is an #if-like directive block""" + return self.directive in ["if","ifdef","ifndef","elif"] + + def isInclude(self): + """checks wether this is a #include directive. if true, then returns the + corresponding file name (with brackets or double-qoutes). None otherwise""" + if self.directive != "include": + return None + + #print "iii " + repr(self.tokens) + if self.tokens[0].id == tokSTRING: + # a double-quote include, that's easy + return self.tokens[0].value + + # we only want the bracket part, not any comments or junk after it + if self.tokens[0].id == "<": + i = 0 + tok = self.tokens + n = len(tok) + while i < n and tok[i].id != ">": + i += 1 + + if i >= n: + return None + + return string.join([ str(x) for x in tok[:i+1] ],"") + + else: + return None + + def __repr__(self): + """generate the representation of a given block""" + if self.directive: + result = "#%s " % self.directive + if self.isIf(): + result += repr(self.expr) + else: + for tok in self.tokens: + result += repr(tok) + else: + result = "" + for tok in self.tokens: + result += repr(tok) + + return result + + def __str__(self): + """generate the string representation of a given block""" + if self.directive: + if self.directive == "if": + # small optimization to re-generate #ifdef and #ifndef + e = self.expr.expr + op = e[0] + if op == "defined": + result = "#ifdef %s" % e[1] + elif op == "!" and e[1][0] == "defined": + result = "#ifndef %s" % e[1][1] + else: + result = "#if " + str(self.expr) + else: + result = "#%s" % self.directive + if len(self.tokens): + result += " " + for tok in self.tokens: + result += str(tok) + else: + result = "" + for tok in self.tokens: + result += str(tok) + + return result + + +class BlockList: + """a convenience class used to hold and process a list of blocks returned by + the cpp parser""" + def __init__(self,blocks): + self.blocks = blocks + + def __len__(self): + return len(self.blocks) + + def __getitem__(self,n): + return self.blocks[n] + + def __repr__(self): + return repr(self.blocks) + + def __str__(self): + result = "" + for b in self.blocks: + result += str(b) + if b.isDirective(): + result += '\n' + return result + + def optimizeIf01(self): + """remove the code between #if 0 .. #endif in a BlockList""" + self.blocks = optimize_if01(self.blocks) + + def optimizeMacros(self, macros): + """remove known defined and undefined macros from a BlockList""" + for b in self.blocks: + if b.isIf(): + b.expr.optimize(macros) + + def removeMacroDefines(self,macros): + """remove known macro definitions from a BlockList""" + self.blocks = remove_macro_defines(self.blocks,macros) + + def removePrefixed(self,prefix,names): + for b in self.blocks: + if b.isIf(): + b.expr.removePrefixed(prefix,names) + + def optimizeAll(self,macros): + self.optimizeMacros(macros) + self.optimizeIf01() + return + + def findIncludes(self): + """return the list of included files in a BlockList""" + result = [] + for b in self.blocks: + i = b.isInclude() + if i: + result.append(i) + + return result + + + def write(self,out): + out.write(str(self)) + + def removeComments(self): + for b in self.blocks: + for tok in b.tokens: + if tok.id == tokSPACE: + tok.value = " " + + def removeEmptyLines(self): + # state = 1 => previous line was tokLN + # state = 0 => previous line was directive + state = 1 + for b in self.blocks: + if b.isDirective(): + #print "$$$ directive %s" % str(b) + state = 0 + else: + # a tokLN followed by spaces is replaced by a single tokLN + # several successive tokLN are replaced by a single one + # + dst = [] + src = b.tokens + n = len(src) + i = 0 + #print "$$$ parsing %s" % repr(src) + while i < n: + # find final tokLN + j = i + while j < n and src[j].id != tokLN: + j += 1 + + if j >= n: + # uhhh + dst += src[i:] + break + + if src[i].id == tokSPACE: + k = i+1 + while src[k].id == tokSPACE: + k += 1 + + if k == j: # empty lines with spaces in it + i = j # remove the spaces + + if i == j: + # an empty line + if state == 1: + i += 1 # remove it + else: + state = 1 + dst.append(src[i]) + i += 1 + else: + # this line is not empty, remove trailing spaces + k = j + while k > i and src[k-1].id == tokSPACE: + k -= 1 + + nn = i + while nn < k: + dst.append(src[nn]) + nn += 1 + dst.append(src[j]) + state = 0 + i = j+1 + + b.tokens = dst + + def removeVarsAndFuncs(self,knownStatics=set()): + """remove all extern and static declarations corresponding + to variable and function declarations. we only accept typedefs + and enum/structs/union declarations. + + however, we keep the definitions corresponding to the set + of known static inline functions in the set 'knownStatics', + which is useful for optimized byteorder swap functions and + stuff like that. + """ + # state = 1 => typedef/struct encountered + # state = 2 => vars or func declaration encountered, skipping until ";" + # state = 0 => normal (i.e. LN + spaces) + state = 0 + depth = 0 + blocks2 = [] + for b in self.blocks: + if b.isDirective(): + blocks2.append(b) + else: + n = len(b.tokens) + i = 0 + first = 0 + if state == 2: + first = n + while i < n: + tok = b.tokens[i] + if state == 0: + bad = 0 + if tok.id in [tokLN, tokSPACE]: + pass + elif tok.value in [ 'struct', 'typedef', 'enum', 'union', '__extension__' ]: + state = 1 + else: + if tok.value in [ 'static', 'extern', '__KINLINE' ]: + j = i+1 + ident = "" + while j < n and not (b.tokens[j].id in [ '(', ';' ]): + if b.tokens[j].id == tokIDENT: + ident = b.tokens[j].value + j += 1 + if j < n and ident in knownStatics: + # this is a known static, we're going to keep its + # definition in the final output + state = 1 + else: + #print "### skip static '%s'" % ident + pass + + if state == 0: + if i > first: + #print "### intermediate from '%s': '%s'" % (tok.value, repr(b.tokens[first:i])) + blocks2.append( Block(b.tokens[first:i]) ) + state = 2 + first = n + + else: # state > 0 + if tok.id == '{': + depth += 1 + + elif tok.id == '}': + if depth > 0: + depth -= 1 + + elif depth == 0 and tok.id == ';': + if state == 2: + first = i+1 + state = 0 + + i += 1 + + if i > first: + #print "### final '%s'" % repr(b.tokens[first:i]) + blocks2.append( Block(b.tokens[first:i]) ) + + self.blocks = blocks2 + + def insertDisclaimer(self,disclaimer="/* auto-generated file, DO NOT EDIT */"): + """insert your standard issue disclaimer that this is an + auto-generated file, etc..""" + tokens = CppLineTokenizer( disclaimer ).toTokenList() + tokens = tokens[:-1] # remove trailing tokLN + self.blocks = [ Block(tokens) ] + self.blocks + +class BlockParser: + """a class used to convert an input source file into a BlockList object""" + + def __init__(self,tokzer=None): + """initialize a block parser. the input source is provided through a Tokenizer + object""" + self.reset(tokzer) + + def reset(self,tokzer): + self.state = 1 + self.tokzer = tokzer + + def getBlocks(self,tokzer=None): + """tokenize and parse the input source, return a BlockList object + NOTE: empty and line-numbering directives are ignored and removed + from the result. as a consequence, it is possible to have + two successive text blocks in the result""" + # state 0 => in source code + # state 1 => in source code, after a LN + # state 2 => in source code, after LN then some space + state = 1 + lastLN = 0 + current = [] + blocks = [] + + if tokzer == None: + tokzer = self.tokzer + + while 1: + tok = tokzer.getToken() + if tok.id == tokEOF: + break + + if tok.id == tokLN: + state = 1 + current.append(tok) + lastLN = len(current) + + elif tok.id == tokSPACE: + if state == 1: + state = 2 + current.append(tok) + + elif tok.id == "#": + if state > 0: + # this is the start of a directive + + if lastLN > 0: + # record previous tokens as text block + block = Block(current[:lastLN]) + blocks.append(block) + lastLN = 0 + + current = [] + + # skip spaces after the # + while 1: + tok = tokzer.getToken() + if tok.id != tokSPACE: + break + + if tok.id != tokIDENT: + # empty or line-numbering, ignore it + if tok.id != tokLN and tok.id != tokEOF: + while 1: + tok = tokzer.getToken() + if tok.id == tokLN or tok.id == tokEOF: + break + continue + + directive = tok.value + lineno = tok.lineno + + # skip spaces + tok = tokzer.getToken() + while tok.id == tokSPACE: + tok = tokzer.getToken() + + # then record tokens until LN + dirtokens = [] + while tok.id != tokLN and tok.id != tokEOF: + dirtokens.append(tok) + tok = tokzer.getToken() + + block = Block(dirtokens,directive,lineno) + blocks.append(block) + state = 1 + + else: + state = 0 + current.append(tok) + + if len(current) > 0: + block = Block(current) + blocks.append(block) + + return BlockList(blocks) + + def parse(self,tokzer): + return self.getBlocks( tokzer ) + + def parseLines(self,lines): + """parse a list of text lines into a BlockList object""" + return self.getBlocks( CppLinesTokenizer(lines) ) + + def parseFile(self,path): + """parse a file into a BlockList object""" + file = open(path, "rt") + result = self.getBlocks( CppFileTokenizer(file) ) + file.close() + return result + + +def test_block_parsing(lines,expected): + blocks = BlockParser().parse( CppLinesTokenizer(lines) ) + if len(blocks) != len(expected): + raise BadExpectedToken, "parser.buildBlocks returned '%s' expecting '%s'" \ + % (str(blocks), repr(expected)) + for n in range(len(blocks)): + if str(blocks[n]) != expected[n]: + raise BadExpectedToken, "parser.buildBlocks()[%d] is '%s', expecting '%s'" \ + % (n, str(blocks[n]), expected[n]) + #for block in blocks: + # print block + +def test_BlockParser(): + test_block_parsing(["#error hello"],["#error hello"]) + test_block_parsing([ "foo", "", "bar" ], [ "foo\n\nbar\n" ]) + test_block_parsing([ "foo", " # ", "bar" ], [ "foo\n","bar\n" ]) + test_block_parsing(\ + [ "foo", " # ", " # /* ahah */ if defined(__KERNEL__) ", "bar", "#endif" ], + [ "foo\n", "#ifdef __KERNEL__", "bar\n", "#endif" ] ) + + +##################################################################################### +##################################################################################### +##### ##### +##### B L O C K L I S T O P T I M I Z A T I O N ##### +##### ##### +##################################################################################### +##################################################################################### + +def remove_macro_defines( blocks, excludedMacros=set() ): + """remove macro definitions like #define <macroName> ....""" + result = [] + for b in blocks: + macroName = b.isDefine() + if macroName == None or not macroName in excludedMacros: + result.append(b) + + return result + +def find_matching_endif( blocks, i ): + n = len(blocks) + depth = 1 + while i < n: + if blocks[i].isDirective(): + dir = blocks[i].directive + if dir in [ "if", "ifndef", "ifdef" ]: + depth += 1 + elif depth == 1 and dir in [ "else", "elif" ]: + return i + elif dir == "endif": + depth -= 1 + if depth == 0: + return i + i += 1 + return i + +def optimize_if01( blocks ): + """remove the code between #if 0 .. #endif in a list of CppBlocks""" + i = 0 + n = len(blocks) + result = [] + while i < n: + j = i + while j < n and not blocks[j].isIf(): + j += 1 + if j > i: + D2("appending lines %d to %d" % (blocks[i].lineno, blocks[j-1].lineno)) + result += blocks[i:j] + if j >= n: + break + expr = blocks[j].expr + r = expr.toInt() + if r == None: + result.append(blocks[j]) + i = j + 1 + continue + + if r == 0: + # if 0 => skip everything until the corresponding #endif + j = find_matching_endif( blocks, j+1 ) + if j >= n: + # unterminated #if 0, finish here + break + dir = blocks[j].directive + if dir == "endif": + D2("remove 'if 0' .. 'endif' (lines %d to %d)" % (blocks[i].lineno, blocks[j].lineno)) + i = j + 1 + elif dir == "else": + # convert 'else' into 'if 1' + D2("convert 'if 0' .. 'else' into 'if 1' (lines %d to %d)" % (blocks[i].lineno, blocks[j-1].lineno)) + blocks[j].directive = "if" + blocks[j].expr = CppExpr( CppLineTokenizer("1").toTokenList() ) + i = j + elif dir == "elif": + # convert 'elif' into 'if' + D2("convert 'if 0' .. 'elif' into 'if'") + blocks[j].directive = "if" + i = j + continue + + # if 1 => find corresponding endif and remove/transform them + k = find_matching_endif( blocks, j+1 ) + if k >= n: + # unterminated #if 1, finish here + D2("unterminated 'if 1'") + result += blocks[j+1:k] + break + + dir = blocks[k].directive + if dir == "endif": + D2("convert 'if 1' .. 'endif' (lines %d to %d)" % (blocks[j].lineno, blocks[k].lineno)) + result += optimize_if01(blocks[j+1:k]) + i = k+1 + elif dir == "else": + # convert 'else' into 'if 0' + D2("convert 'if 1' .. 'else' (lines %d to %d)" % (blocks[j].lineno, blocks[k].lineno)) + result += optimize_if01(blocks[j+1:k]) + blocks[k].directive = "if" + blocks[k].expr = CppExpr( CppLineTokenizer("0").toTokenList() ) + i = k + elif dir == "elif": + # convert 'elif' into 'if 0' + D2("convert 'if 1' .. 'elif' (lines %d to %d)" % (blocks[j].lineno, blocks[k].lineno)) + result += optimize_if01(blocks[j+1:k]) + blocks[k].expr = CppExpr( CppLineTokenizer("0").toTokenList() ) + i = k + return result + +def test_optimizeAll(): + text = """\ +#if 1 +#define GOOD_1 +#endif +#if 0 +#define BAD_2 +#define BAD_3 +#endif + +#if 1 +#define GOOD_2 +#else +#define BAD_4 +#endif + +#if 0 +#define BAD_5 +#else +#define GOOD_3 +#endif + +#if 0 +#if 1 +#define BAD_6 +#endif +#endif\ +""" + + expected = """\ +#define GOOD_1 + +#define GOOD_2 + +#define GOOD_3 + +""" + + print "running test_BlockList.optimizeAll" + out = StringOutput() + lines = string.split(text, '\n') + list = BlockParser().parse( CppLinesTokenizer(lines) ) + #D_setlevel(2) + list.optimizeAll( {"__KERNEL__":kCppUndefinedMacro} ) + #print repr(list) + list.write(out) + if out.get() != expected: + print "KO: macro optimization failed\n" + print "<<<< expecting '", + print expected, + print "'\n>>>> result '" + print out.get(), + print "'\n----" + + +##################################################################################### +##################################################################################### +##### ##### +##### ##### +##### ##### +##################################################################################### +##################################################################################### + +def runUnitTests(): + """run all unit tests for this program""" + print "running unit tests" + test_CppTokenizer() + test_CppExpr() + test_optimizeAll() + test_BlockParser() + print "OK" + +if __name__ == "__main__": + runUnitTests() |