1# Copyright 2015 Google Inc. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""Comment splicer for lib2to3 trees.
15
16The lib2to3 syntax tree produced by the parser holds comments and whitespace in
17prefix attributes of nodes, rather than nodes themselves. This module provides
18functionality to splice comments out of prefixes and into nodes of their own,
19making them easier to process.
20
21  SpliceComments(): the main function exported by this module.
22"""
23
24from lib2to3 import pygram
25from lib2to3 import pytree
26from lib2to3.pgen2 import token
27
28from yapf.yapflib import pytree_utils
29
30
31def SpliceComments(tree):
32  """Given a pytree, splice comments into nodes of their own right.
33
34  Extract comments from the prefixes where they are housed after parsing.
35  The prefixes that previously housed the comments become empty.
36
37  Args:
38    tree: a pytree.Node - the tree to work on. The tree is modified by this
39        function.
40  """
41  # The previous leaf node encountered in the traversal.
42  # This is a list because Python 2.x doesn't have 'nonlocal' :)
43  prev_leaf = [None]
44  _AnnotateIndents(tree)
45
46  def _VisitNodeRec(node):
47    # This loop may insert into node.children, so we'll iterate over a copy.
48    for child in node.children[:]:
49      if isinstance(child, pytree.Node):
50        # Nodes don't have prefixes.
51        _VisitNodeRec(child)
52      else:
53        if child.prefix.lstrip().startswith('#'):
54          # We have a comment prefix in this child, so splicing is needed.
55          comment_prefix = child.prefix
56          comment_lineno = child.lineno - comment_prefix.count('\n')
57          comment_column = child.column
58
59          # Remember the leading indentation of this prefix and clear it.
60          # Mopping up the prefix is important because we may go over this same
61          # child in the next iteration...
62          child_prefix = child.prefix.lstrip('\n')
63          prefix_indent = child_prefix[:child_prefix.find('#')]
64          if '\n' in prefix_indent:
65            prefix_indent = prefix_indent[prefix_indent.rfind('\n') + 1:]
66          child.prefix = ''
67
68          if child.type == token.NEWLINE:
69            # If the prefix was on a NEWLINE leaf, it's part of the line so it
70            # will be inserted after the previously encountered leaf.
71            # We can't just insert it before the NEWLINE node, because as a
72            # result of the way pytrees are organized, this node can be under
73            # an inappropriate parent.
74            comment_column -= len(comment_prefix.lstrip())
75            pytree_utils.InsertNodesAfter(
76                _CreateCommentsFromPrefix(
77                    comment_prefix,
78                    comment_lineno,
79                    comment_column,
80                    standalone=False), prev_leaf[0])
81          elif child.type == token.DEDENT:
82            # Comment prefixes on DEDENT nodes also deserve special treatment,
83            # because their final placement depends on their prefix.
84            # We'll look for an ancestor of this child with a matching
85            # indentation, and insert the comment before it if the ancestor is
86            # on a DEDENT node and after it otherwise.
87            #
88            # lib2to3 places comments that should be separated into the same
89            # DEDENT node. For example, "comment 1" and "comment 2" will be
90            # combined.
91            #
92            #   def _():
93            #     for x in y:
94            #       pass
95            #       # comment 1
96            #
97            #     # comment 2
98            #     pass
99            #
100            # In this case, we need to split them up ourselves.
101
102            # Split into groups of comments at decreasing levels of indentation
103            comment_groups = []
104            comment_column = None
105            for cmt in comment_prefix.split('\n'):
106              col = cmt.find('#')
107              if col < 0:
108                if comment_column is None:
109                  # Skip empty lines at the top of the first comment group
110                  comment_lineno += 1
111                  continue
112              elif comment_column is None or col < comment_column:
113                comment_column = col
114                comment_indent = cmt[:comment_column]
115                comment_groups.append((comment_column, comment_indent, []))
116              comment_groups[-1][-1].append(cmt)
117
118            # Insert a node for each group
119            for comment_column, comment_indent, comment_group in comment_groups:
120              ancestor_at_indent = _FindAncestorAtIndent(child, comment_indent)
121              if ancestor_at_indent.type == token.DEDENT:
122                InsertNodes = pytree_utils.InsertNodesBefore  # pylint: disable=invalid-name
123              else:
124                InsertNodes = pytree_utils.InsertNodesAfter  # pylint: disable=invalid-name
125              InsertNodes(
126                  _CreateCommentsFromPrefix(
127                      '\n'.join(comment_group) + '\n',
128                      comment_lineno,
129                      comment_column,
130                      standalone=True), ancestor_at_indent)
131              comment_lineno += len(comment_group)
132          else:
133            # Otherwise there are two cases.
134            #
135            # 1. The comment is on its own line
136            # 2. The comment is part of an expression.
137            #
138            # Unfortunately, it's fairly difficult to distinguish between the
139            # two in lib2to3 trees. The algorithm here is to determine whether
140            # child is the first leaf in the statement it belongs to. If it is,
141            # then the comment (which is a prefix) belongs on a separate line.
142            # If it is not, it means the comment is buried deep in the statement
143            # and is part of some expression.
144            stmt_parent = _FindStmtParent(child)
145
146            for leaf_in_parent in stmt_parent.leaves():
147              if leaf_in_parent.type == token.NEWLINE:
148                continue
149              elif id(leaf_in_parent) == id(child):
150                # This comment stands on its own line, and it has to be inserted
151                # into the appropriate parent. We'll have to find a suitable
152                # parent to insert into. See comments above
153                # _STANDALONE_LINE_NODES for more details.
154                node_with_line_parent = _FindNodeWithStandaloneLineParent(child)
155                pytree_utils.InsertNodesBefore(
156                    _CreateCommentsFromPrefix(
157                        comment_prefix, comment_lineno, 0, standalone=True),
158                    node_with_line_parent)
159                break
160              else:
161                if comment_lineno == prev_leaf[0].lineno:
162                  comment_lines = comment_prefix.splitlines()
163                  value = comment_lines[0].lstrip()
164                  if value.rstrip('\n'):
165                    comment_column = prev_leaf[0].column
166                    comment_column += len(prev_leaf[0].value)
167                    comment_column += (
168                        len(comment_lines[0]) - len(comment_lines[0].lstrip()))
169                    comment_leaf = pytree.Leaf(
170                        type=token.COMMENT,
171                        value=value.rstrip('\n'),
172                        context=('', (comment_lineno, comment_column)))
173                    pytree_utils.InsertNodesAfter([comment_leaf], prev_leaf[0])
174                    comment_prefix = '\n'.join(comment_lines[1:])
175                    comment_lineno += 1
176
177                rindex = (0 if '\n' not in comment_prefix.rstrip() else
178                          comment_prefix.rstrip().rindex('\n') + 1)
179                comment_column = (
180                    len(comment_prefix[rindex:]) - len(
181                        comment_prefix[rindex:].lstrip()))
182                comments = _CreateCommentsFromPrefix(
183                    comment_prefix,
184                    comment_lineno,
185                    comment_column,
186                    standalone=False)
187                pytree_utils.InsertNodesBefore(comments, child)
188                break
189
190        prev_leaf[0] = child
191
192  _VisitNodeRec(tree)
193
194
195def _CreateCommentsFromPrefix(comment_prefix,
196                              comment_lineno,
197                              comment_column,
198                              standalone=False):
199  """Create pytree nodes to represent the given comment prefix.
200
201  Args:
202    comment_prefix: (unicode) the text of the comment from the node's prefix.
203    comment_lineno: (int) the line number for the start of the comment.
204    comment_column: (int) the column for the start of the comment.
205    standalone: (bool) determines if the comment is standalone or not.
206
207  Returns:
208    The simple_stmt nodes if this is a standalone comment, otherwise a list of
209    new COMMENT leafs. The prefix may consist of multiple comment blocks,
210    separated by blank lines. Each block gets its own leaf.
211  """
212  # The comment is stored in the prefix attribute, with no lineno of its
213  # own. So we only know at which line it ends. To find out at which line it
214  # starts, look at how many newlines the comment itself contains.
215  comments = []
216
217  lines = comment_prefix.split('\n')
218  index = 0
219  while index < len(lines):
220    comment_block = []
221    while index < len(lines) and lines[index].lstrip().startswith('#'):
222      comment_block.append(lines[index].strip())
223      index += 1
224
225    if comment_block:
226      new_lineno = comment_lineno + index - 1
227      comment_block[0] = comment_block[0].strip()
228      comment_block[-1] = comment_block[-1].strip()
229      comment_leaf = pytree.Leaf(
230          type=token.COMMENT,
231          value='\n'.join(comment_block),
232          context=('', (new_lineno, comment_column)))
233      comment_node = comment_leaf if not standalone else pytree.Node(
234          pygram.python_symbols.simple_stmt, [comment_leaf])
235      comments.append(comment_node)
236
237    while index < len(lines) and not lines[index].lstrip():
238      index += 1
239
240  return comments
241
242
243# "Standalone line nodes" are tree nodes that have to start a new line in Python
244# code (and cannot follow a ';' or ':'). Other nodes, like 'expr_stmt', serve as
245# parents of other nodes but can come later in a line. This is a list of
246# standalone line nodes in the grammar. It is meant to be exhaustive
247# *eventually*, and we'll modify it with time as we discover more corner cases
248# in the parse tree.
249#
250# When splicing a standalone comment (i.e. a comment that appears on its own
251# line, not on the same line with other code), it's important to insert it into
252# an appropriate parent of the node it's attached to. An appropriate parent
253# is the first "standaline line node" in the parent chain of a node.
254_STANDALONE_LINE_NODES = frozenset([
255    'suite', 'if_stmt', 'while_stmt', 'for_stmt', 'try_stmt', 'with_stmt',
256    'funcdef', 'classdef', 'decorated', 'file_input'
257])
258
259
260def _FindNodeWithStandaloneLineParent(node):
261  """Find a node whose parent is a 'standalone line' node.
262
263  See the comment above _STANDALONE_LINE_NODES for more details.
264
265  Arguments:
266    node: node to start from
267
268  Returns:
269    Suitable node that's either the node itself or one of its ancestors.
270  """
271  if pytree_utils.NodeName(node.parent) in _STANDALONE_LINE_NODES:
272    return node
273  else:
274    # This is guaranteed to terminate because 'file_input' is the root node of
275    # any pytree.
276    return _FindNodeWithStandaloneLineParent(node.parent)
277
278
279# "Statement nodes" are standalone statements. The don't have to start a new
280# line.
281_STATEMENT_NODES = frozenset(['simple_stmt']) | _STANDALONE_LINE_NODES
282
283
284def _FindStmtParent(node):
285  """Find the nearest parent of node that is a statement node.
286
287  Arguments:
288    node: node to start from
289
290  Returns:
291    Nearest parent (or node itself, if suitable).
292  """
293  if pytree_utils.NodeName(node) in _STATEMENT_NODES:
294    return node
295  else:
296    return _FindStmtParent(node.parent)
297
298
299def _FindAncestorAtIndent(node, indent):
300  """Find an ancestor of node with the given indentation.
301
302  Arguments:
303    node: node to start from. This must not be the tree root.
304    indent: indentation string for the ancestor we're looking for.
305        See _AnnotateIndents for more details.
306
307  Returns:
308    An ancestor node with suitable indentation. If no suitable ancestor is
309    found, the closest ancestor to the tree root is returned.
310  """
311  if node.parent.parent is None:
312    # Our parent is the tree root, so there's nowhere else to go.
313    return node
314
315  # If the parent has an indent annotation, and it's shorter than node's
316  # indent, this is a suitable ancestor.
317  # The reason for "shorter" rather than "equal" is that comments may be
318  # improperly indented (i.e. by three spaces, where surrounding statements
319  # have either zero or two or four), and we don't want to propagate them all
320  # the way to the root.
321  parent_indent = pytree_utils.GetNodeAnnotation(
322      node.parent, pytree_utils.Annotation.CHILD_INDENT)
323  if parent_indent is not None and indent.startswith(parent_indent):
324    return node
325  else:
326    # Keep looking up the tree.
327    return _FindAncestorAtIndent(node.parent, indent)
328
329
330def _AnnotateIndents(tree):
331  """Annotate the tree with child_indent annotations.
332
333  A child_indent annotation on a node specifies the indentation (as a string,
334  like "  ") of its children. It is inferred from the INDENT child of a node.
335
336  Arguments:
337    tree: root of a pytree. The pytree is modified to add annotations to nodes.
338
339  Raises:
340    RuntimeError: if the tree is malformed.
341  """
342  # Annotate the root of the tree with zero indent.
343  if tree.parent is None:
344    pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
345                                   '')
346  for child in tree.children:
347    if child.type == token.INDENT:
348      child_indent = pytree_utils.GetNodeAnnotation(
349          tree, pytree_utils.Annotation.CHILD_INDENT)
350      if child_indent is not None and child_indent != child.value:
351        raise RuntimeError('inconsistent indentation for child', (tree, child))
352      pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
353                                     child.value)
354    _AnnotateIndents(child)
355