# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import parser
import symbol
import sys
import token
import tokenize

from py_utils.refactor import offset_token


class Snippet(object):
  """A node in the Python parse tree.

  The Python grammar is defined at:
  https://docs.python.org/2/reference/grammar.html

  There are two types of Snippets:
    TokenSnippets are leaf nodes containing actual text.
    Symbols are internal nodes representing higher-level groupings, and are
      defined by the left-hand sides of the BNFs in the above link.
  """

  @property
  def type(self):
    """The int type code of this node (a token.* or symbol.* constant)."""
    raise NotImplementedError()

  @property
  def type_name(self):
    """The human-readable name of this node's type."""
    raise NotImplementedError()

  @property
  def children(self):
    """Return a list of this node's children."""
    raise NotImplementedError()

  @property
  def tokens(self):
    """Return a tuple of the tokens this Snippet contains."""
    raise NotImplementedError()

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    raise NotImplementedError()

  def __str__(self):
    return offset_token.Untokenize(self.tokens)

  def FindAll(self, snippet_type):
    """Yield this node and every descendant matching the given type.

    Args:
      snippet_type: Either an int token/symbol type code, or a Snippet
          subclass to be matched with isinstance().

    Yields:
      Matching Snippets, in pre-order (self before descendants).
    """
    if isinstance(snippet_type, int):
      if self.type == snippet_type:
        yield self
    else:
      if isinstance(self, snippet_type):
        yield self

    for child in self.children:
      for snippet in child.FindAll(snippet_type):
        yield snippet

  def FindChild(self, snippet_type, **kwargs):
    """Return the first direct child matching the type and attribute values.

    Args:
      snippet_type: Either an int token/symbol type code, or a Snippet
          subclass to be matched with isinstance().
      **kwargs: Attribute name/value pairs the child must also match.

    Returns:
      The first matching direct child.

    Raises:
      ValueError: If no direct child matches.
    """
    for child in self.children:
      if isinstance(snippet_type, int):
        if child.type != snippet_type:
          continue
      else:
        if not isinstance(child, snippet_type):
          continue

      # BUG FIX: iterating a dict directly yields only its keys, so the
      # original "for attribute, value in kwargs:" tried to unpack each key
      # string into two names. We need the key/value pairs.
      for attribute, value in kwargs.items():
        if getattr(child, attribute) != value:
          break
      else:
        # All attribute filters matched (loop completed without break).
        return child
    raise ValueError('%s is not in %s. Children are: %s' %
                     (snippet_type, self, self.children))

  def FindChildren(self, snippet_type):
    """Yield all direct children matching the given type.

    Args:
      snippet_type: Either an int token/symbol type code, or a Snippet
          subclass to be matched with isinstance().
    """
    if isinstance(snippet_type, int):
      for child in self.children:
        if child.type == snippet_type:
          yield child
    else:
      for child in self.children:
        if isinstance(child, snippet_type):
          yield child


class TokenSnippet(Snippet):
  """A Snippet containing a list of tokens.

  A list of tokens may start with any number of comments and non-terminating
  newlines, but must end with a syntactically meaningful token.
  """

  def __init__(self, token_type, tokens):
    # For operators and delimiters, the TokenSnippet's type may be more
    # specific than the type of the constituent token. E.g. the TokenSnippet
    # type is token.DOT, but the token type is token.OP. This is because the
    # parser has more context than the tokenizer.
    self._type = token_type
    self._tokens = tokens
    self._modified = False

  @classmethod
  def Create(cls, token_type, string, offset=(0, 0)):
    """Build a TokenSnippet holding a single new token with the given text."""
    return cls(token_type,
               [offset_token.OffsetToken(token_type, string, offset)])

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return token.tok_name[self.type]

  @property
  def value(self):
    # The last token is the syntactically meaningful one; any preceding
    # tokens are comments and non-terminating newlines.
    return self._tokens[-1].string

  @value.setter
  def value(self, value):
    self._tokens[-1].string = value
    self._modified = True

  @property
  def children(self):
    # Leaf node: no children by definition.
    return []

  @property
  def tokens(self):
    return tuple(self._tokens)

  @property
  def modified(self):
    return self._modified

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    # Written with stream.write() (instead of the Python-2-only
    # "print >> stream" statement) so the module parses under Python 3 too;
    # the output is byte-identical.
    stream.write(' ' * indent)
    if not self.tokens:
      stream.write('%s\n' % self.type_name)
      return

    stream.write('%-4s %r\n' % (self.type_name, self.tokens[0].string))
    for tok in self.tokens[1:]:
      stream.write(' ' * indent)
      stream.write('%s %r\n' % (' ' * max(len(self.type_name), 4),
                                tok.string))


class Symbol(Snippet):
  """A Snippet containing sub-Snippets.

  The possible types and type_names are defined in Python's symbol module."""

  def __init__(self, symbol_type, children):
    self._type = symbol_type
    self._children = children

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return symbol.sym_name[self.type]

  @property
  def children(self):
    return self._children

  @children.setter
  def children(self, value):  # pylint: disable=arguments-differ
    self._children = value

  @property
  def tokens(self):
    # A Symbol's tokens are the concatenation of its children's tokens.
    tokens = []
    for child in self.children:
      tokens += child.tokens
    return tuple(tokens)

  @property
  def modified(self):
    return any(child.modified for child in self.children)

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    stream.write(' ' * indent)

    # If there's only one child, collapse it onto the same line.
    # (stream.write() replaces the Python-2-only "print >> stream, x,"
    # statement; the trailing comma's softspace becomes an explicit ' '.)
    node = self
    while len(node.children) == 1 and len(node.children[0].children) == 1:
      stream.write('%s ' % node.type_name)
      node = node.children[0]

    stream.write('%s\n' % node.type_name)
    for child in node.children:
      child.PrintTree(indent + 2, stream)


def Snippetize(f):
  """Return the syntax tree of the given file.

  Args:
    f: A seekable file-like object containing Python source.

  Returns:
    The root Snippet of the annotated parse tree.
  """
  f.seek(0)
  syntax_tree = parser.st2list(parser.suite(f.read()))
  tokens = offset_token.Tokenize(f)

  snippet = _SnippetizeNode(syntax_tree, tokens)
  # Every token must have been consumed by the tree walk.
  assert not tokens
  return snippet


def _SnippetizeNode(node, tokens):
  """Recursively convert a parser syntax-tree node into a Snippet.

  The parser module gives a syntax tree that discards comments,
  non-terminating newlines, and whitespace information. Use the tokens given
  by the tokenize module to annotate the syntax tree with the information
  needed to exactly reproduce the original source code.

  Args:
    node: A nested-list syntax tree node as produced by parser.st2list().
    tokens: A deque of offset_token.OffsetTokens, consumed from the left.

  Returns:
    A Symbol or TokenSnippet corresponding to the given node.
  """
  node_type = node[0]

  if node_type >= token.NT_OFFSET:
    # Symbol (non-terminal): recurse into its children.
    children = tuple(_SnippetizeNode(child, tokens) for child in node[1:])
    return Symbol(node_type, children)
  else:
    # Token (terminal). First absorb any leading comments and
    # non-terminating newlines, which the parser tree does not represent.
    grabbed_tokens = []
    while tokens and (
        tokens[0].type == tokenize.COMMENT or tokens[0].type == tokenize.NL):
      grabbed_tokens.append(tokens.popleft())

    # parser has 2 NEWLINEs right before the end.
    # tokenize has 0 or 1 depending on if the file has one.
    # Create extra nodes without consuming tokens to account for this.
    if node_type == token.NEWLINE:
      for tok in tokens:
        if tok.type == token.ENDMARKER:
          return TokenSnippet(node_type, grabbed_tokens)
        if tok.type != token.DEDENT:
          break

    # The token types must agree, except that the tokenizer reports all
    # operators/delimiters as token.OP while the parser is more specific.
    assert tokens[0].type == token.OP or node_type == tokens[0].type

    grabbed_tokens.append(tokens.popleft())
    return TokenSnippet(node_type, grabbed_tokens)