1# Copyright 2015 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import parser
6import symbol
7import sys
8import token
9import tokenize
10
11from py_utils.refactor import offset_token
12
13
class Snippet(object):
  """A node in the Python parse tree.

  The Python grammar is defined at:
  https://docs.python.org/2/reference/grammar.html

  There are two types of Snippets:
    TokenSnippets are leaf nodes containing actual text.
    Symbols are internal nodes representing higher-level groupings, and are
        defined by the left-hand sides of the BNFs in the above link.
  """
  @property
  def type(self):
    """The grammar type ID of this node (from the token/symbol modules)."""
    raise NotImplementedError()

  @property
  def type_name(self):
    """The human-readable name of this node's grammar type."""
    raise NotImplementedError()

  @property
  def children(self):
    """Return a list of this node's children."""
    raise NotImplementedError()

  @property
  def tokens(self):
    """Return a tuple of the tokens this Snippet contains."""
    raise NotImplementedError()

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    raise NotImplementedError()

  def __str__(self):
    return offset_token.Untokenize(self.tokens)

  def FindAll(self, snippet_type):
    """Yield this node and every descendant matching the given type.

    Args:
      snippet_type: Either a grammar type ID (int) compared against
          self.type, or a Snippet subclass matched with isinstance().

    Yields:
      Matching Snippets, in depth-first pre-order.
    """
    if isinstance(snippet_type, int):
      if self.type == snippet_type:
        yield self
    else:
      if isinstance(self, snippet_type):
        yield self

    for child in self.children:
      for snippet in child.FindAll(snippet_type):
        yield snippet

  def FindChild(self, snippet_type, **kwargs):
    """Return the first direct child matching the type and attributes.

    Args:
      snippet_type: Either a grammar type ID (int) or a Snippet subclass.
      **kwargs: Optional attribute name/value pairs the child must also
          satisfy, compared as getattr(child, name) == value.

    Returns:
      The first matching direct child.

    Raises:
      ValueError: No direct child matches.
    """
    for child in self.children:
      if isinstance(snippet_type, int):
        if child.type != snippet_type:
          continue
      else:
        if not isinstance(child, snippet_type):
          continue

      # Iterate name/value pairs. Iterating the dict itself would yield only
      # the key strings and fail to unpack into (attribute, value).
      for attribute, value in kwargs.items():
        if getattr(child, attribute) != value:
          break
      else:
        return child
    raise ValueError('%s is not in %s. Children are: %s' %
                     (snippet_type, self, self.children))

  def FindChildren(self, snippet_type):
    """Yield all direct children matching the given type."""
    if isinstance(snippet_type, int):
      for child in self.children:
        if child.type == snippet_type:
          yield child
    else:
      for child in self.children:
        if isinstance(child, snippet_type):
          yield child
89
class TokenSnippet(Snippet):
  """A Snippet containing a list of tokens.

  A list of tokens may start with any number of comments and non-terminating
  newlines, but must end with a syntactically meaningful token.
  """

  def __init__(self, token_type, tokens):
    # For operators and delimiters, the TokenSnippet's type may be more specific
    # than the type of the constituent token. E.g. the TokenSnippet type is
    # token.DOT, but the token type is token.OP. This is because the parser
    # has more context than the tokenizer.
    self._type = token_type
    self._tokens = tokens
    self._modified = False

  @classmethod
  def Create(cls, token_type, string, offset=(0, 0)):
    """Build a TokenSnippet wrapping a single newly created token."""
    return cls(token_type,
               [offset_token.OffsetToken(token_type, string, offset)])

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return token.tok_name[self.type]

  @property
  def value(self):
    # The value is the text of the last token; any preceding tokens are
    # comments and non-terminating newlines.
    return self._tokens[-1].string

  @value.setter
  def value(self, value):
    self._tokens[-1].string = value
    self._modified = True

  @property
  def children(self):
    return []

  @property
  def tokens(self):
    return tuple(self._tokens)

  @property
  def modified(self):
    """Whether this snippet's value has been changed since construction."""
    return self._modified

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    # Use stream.write() instead of Python 2 `print >>` statements so this
    # module also runs under Python 3; the emitted text is identical.
    stream.write(' ' * indent)
    if not self.tokens:
      stream.write(self.type_name + '\n')
      return

    stream.write('%-4s %s\n' % (self.type_name, repr(self.tokens[0].string)))
    for tok in self.tokens[1:]:
      stream.write(' ' * indent)
      stream.write('%s %s\n' %
                   (' ' * max(len(self.type_name), 4), repr(tok.string)))
150
151
class Symbol(Snippet):
  """A Snippet containing sub-Snippets.

  The possible types and type_names are defined in Python's symbol module."""

  def __init__(self, symbol_type, children):
    self._type = symbol_type
    self._children = children

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    return symbol.sym_name[self.type]

  @property
  def children(self):
    return self._children

  @children.setter
  def children(self, value):  # pylint: disable=arguments-differ
    self._children = value

  @property
  def tokens(self):
    # A Symbol owns no tokens directly; it is the in-order concatenation of
    # its children's tokens.
    tokens = []
    for child in self.children:
      tokens += child.tokens
    return tuple(tokens)

  @property
  def modified(self):
    """Whether any descendant token has been changed since construction."""
    return any(child.modified for child in self.children)

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    # Use stream.write() instead of Python 2 `print >>` statements so this
    # module also runs under Python 3. The collapsed type names are joined by
    # single spaces, matching the old softspace behavior of trailing-comma
    # print statements.
    stream.write(' ' * indent)

    # If there's only one child, collapse it onto the same line.
    node = self
    while len(node.children) == 1 and len(node.children[0].children) == 1:
      stream.write(node.type_name + ' ')
      node = node.children[0]

    stream.write(node.type_name + '\n')
    for child in node.children:
      child.PrintTree(indent + 2, stream)
200
201
def Snippetize(f):
  """Build the annotated Snippet tree for the given source file.

  Args:
    f: An open file-like object containing Python source; it is rewound
        before reading.

  Returns:
    The root Snippet of the file's parse tree, annotated with tokens.
  """
  f.seek(0)
  tree = parser.st2list(parser.suite(f.read()))
  token_queue = offset_token.Tokenize(f)
  root = _SnippetizeNode(tree, token_queue)
  # Every token must have been attached to some node of the tree.
  assert not token_queue
  return root
211
212
def _SnippetizeNode(node, tokens):
  """Recursively convert one parser syntax-tree node into a Snippet.

  The parser module gives a syntax tree that discards comments,
  non-terminating newlines, and whitespace information. Use the tokens given
  by the tokenize module to annotate the syntax tree with the information
  needed to exactly reproduce the original source code.

  Args:
    node: A nested-list syntax tree node as produced by parser.st2list();
        node[0] is the grammar type ID and the remaining elements are child
        nodes (for symbols) or token data (for terminals).
    tokens: A deque of offset tokens for the same source, consumed from the
        left as leaf snippets are built.

  Returns:
    A Symbol (for non-terminals) or TokenSnippet (for terminals).
  """
  node_type = node[0]

  if node_type >= token.NT_OFFSET:
    # Symbol (non-terminal): recurse into the children; token consumption
    # happens only at the leaves.
    children = tuple(_SnippetizeNode(child, tokens) for child in node[1:])
    return Symbol(node_type, children)
  else:
    # Token (terminal). First sweep up any leading comments and
    # non-terminating newlines, which the parse tree does not represent,
    # and attach them to this leaf.
    grabbed_tokens = []
    while tokens and (
        tokens[0].type == tokenize.COMMENT or tokens[0].type == tokenize.NL):
      grabbed_tokens.append(tokens.popleft())

    # parser has 2 NEWLINEs right before the end.
    # tokenize has 0 or 1 depending on if the file has one.
    # Create extra nodes without consuming tokens to account for this.
    if node_type == token.NEWLINE:
      for tok in tokens:
        if tok.type == token.ENDMARKER:
          # Only DEDENTs remain before ENDMARKER: this NEWLINE exists in the
          # parse tree but has no corresponding token to consume.
          return TokenSnippet(node_type, grabbed_tokens)
        if tok.type != token.DEDENT:
          break

    # The parser reports a more specific type for operators/delimiters
    # (e.g. token.DOT) than the tokenizer (token.OP); otherwise the two
    # must agree exactly.
    assert tokens[0].type == token.OP or node_type == tokens[0].type

    grabbed_tokens.append(tokens.popleft())
    return TokenSnippet(node_type, grabbed_tokens)
245