| # Copyright 2015 Google Inc. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| """Comment splicer for lib2to3 trees. |
| |
| The lib2to3 syntax tree produced by the parser holds comments and whitespace in |
| prefix attributes of nodes, rather than nodes themselves. This module provides |
| functionality to splice comments out of prefixes and into nodes of their own, |
| making them easier to process. |
| |
| SpliceComments(): the main function exported by this module. |
| """ |
| |
| from lib2to3 import pygram |
| from lib2to3 import pytree |
| from lib2to3.pgen2 import token |
| |
| from yapf.yapflib import pytree_utils |
| |
| |
def SpliceComments(tree):
  """Given a pytree, splice comments into nodes of their own right.

  Extract comments from the prefixes where they are housed after parsing.
  The prefixes that previously housed the comments become empty.

  Args:
    tree: a pytree.Node - the tree to work on. The tree is modified by this
      function.
  """
  # The previous leaf node encountered in the traversal.
  # This is a list because Python 2.x doesn't have 'nonlocal' :)
  prev_leaf = [None]
  _AnnotateIndents(tree)

  def _VisitNodeRec(node):
    """Visit node's subtree depth first, splicing comment prefixes of leaves."""
    # This loop may insert into node.children, so we'll iterate over a copy.
    for child in node.children[:]:
      if isinstance(child, pytree.Node):
        # Nodes don't have prefixes.
        _VisitNodeRec(child)
      else:
        if child.prefix.lstrip().startswith('#'):
          # We have a comment prefix in this child, so splicing is needed.
          comment_prefix = child.prefix
          # The leaf's lineno is the line the leaf itself is on; the prefix
          # starts as many lines earlier as it contains newlines.
          comment_lineno = child.lineno - comment_prefix.count('\n')
          comment_column = child.column

          # The prefix text has been captured above, so clear it on the leaf.
          # Mopping up the prefix is important because we may go over this
          # same child in the next iteration...
          child.prefix = ''

          if child.type == token.NEWLINE:
            # If the prefix was on a NEWLINE leaf, it's part of the line so it
            # will be inserted after the previously encountered leaf.
            # We can't just insert it before the NEWLINE node, because as a
            # result of the way pytrees are organized, this node can be under
            # an inappropriate parent.
            comment_column -= len(comment_prefix.lstrip())
            pytree_utils.InsertNodesAfter(
                _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False), prev_leaf[0])
          elif child.type == token.DEDENT:
            # Comment prefixes on DEDENT nodes also deserve special treatment,
            # because their final placement depends on their prefix.
            # We'll look for an ancestor of this child with a matching
            # indentation, and insert the comment before it if the ancestor is
            # on a DEDENT node and after it otherwise.
            #
            # lib2to3 places comments that should be separated into the same
            # DEDENT node. For example, "comment 1" and "comment 2" will be
            # combined.
            #
            #   def _():
            #     for x in y:
            #       pass
            #       # comment 1
            #
            #     # comment 2
            #     pass
            #
            # In this case, we need to split them up ourselves.

            # Split into groups of comments at decreasing levels of
            # indentation. Each group is a tuple of
            # (column of '#', indentation string, list of prefix lines).
            comment_groups = []
            comment_column = None
            for cmt in comment_prefix.split('\n'):
              col = cmt.find('#')
              if col < 0:
                if comment_column is None:
                  # Skip empty lines at the top of the first comment group
                  comment_lineno += 1
                  continue
              elif comment_column is None or col < comment_column:
                # First comment line, or one indented less than the current
                # group: start a new group.
                comment_column = col
                comment_indent = cmt[:comment_column]
                comment_groups.append((comment_column, comment_indent, []))
              comment_groups[-1][-1].append(cmt)

            # Insert a node for each group
            for comment_column, comment_indent, comment_group in comment_groups:
              ancestor_at_indent = _FindAncestorAtIndent(child, comment_indent)
              if ancestor_at_indent.type == token.DEDENT:
                InsertNodes = pytree_utils.InsertNodesBefore  # pylint: disable=invalid-name
              else:
                InsertNodes = pytree_utils.InsertNodesAfter  # pylint: disable=invalid-name
              InsertNodes(
                  _CreateCommentsFromPrefix(
                      '\n'.join(comment_group) + '\n',
                      comment_lineno,
                      comment_column,
                      standalone=True), ancestor_at_indent)
              # The next group starts right after the lines of this one.
              comment_lineno += len(comment_group)
          else:
            # Otherwise there are two cases.
            #
            # 1. The comment is on its own line
            # 2. The comment is part of an expression.
            #
            # Unfortunately, it's fairly difficult to distinguish between the
            # two in lib2to3 trees. The algorithm here is to determine whether
            # child is the first leaf in the statement it belongs to. If it is,
            # then the comment (which is a prefix) belongs on a separate line.
            # If it is not, it means the comment is buried deep in the statement
            # and is part of some expression.
            stmt_parent = _FindStmtParent(child)

            # Only the first non-NEWLINE leaf of the statement is examined:
            # both remaining branches end with a break.
            for leaf_in_parent in stmt_parent.leaves():
              if leaf_in_parent.type == token.NEWLINE:
                continue
              elif leaf_in_parent is child:
                # This comment stands on its own line, and it has to be inserted
                # into the appropriate parent. We'll have to find a suitable
                # parent to insert into. See comments above
                # _STANDALONE_LINE_NODES for more details.
                node_with_line_parent = _FindNodeWithStandaloneLineParent(child)
                pytree_utils.InsertNodesBefore(
                    _CreateCommentsFromPrefix(
                        comment_prefix, comment_lineno, 0, standalone=True),
                    node_with_line_parent)
                break
              else:
                if comment_lineno == prev_leaf[0].lineno:
                  # The first prefix line shares its line with the previous
                  # leaf, so it is spliced in directly after that leaf.
                  comment_lines = comment_prefix.splitlines()
                  value = comment_lines[0].lstrip()
                  if value.rstrip('\n'):
                    # Column = end of the previous leaf plus the whitespace
                    # that precedes the comment text.
                    comment_column = prev_leaf[0].column
                    comment_column += len(prev_leaf[0].value)
                    comment_column += (
                        len(comment_lines[0]) - len(comment_lines[0].lstrip()))
                    comment_leaf = pytree.Leaf(
                        type=token.COMMENT,
                        value=value.rstrip('\n'),
                        context=('', (comment_lineno, comment_column)))
                    pytree_utils.InsertNodesAfter([comment_leaf], prev_leaf[0])
                    # The remaining lines are handled below.
                    comment_prefix = '\n'.join(comment_lines[1:])
                    comment_lineno += 1

                # The column is the width of the leading whitespace on the
                # last line of the prefix (trailing whitespace ignored).
                rindex = (0 if '\n' not in comment_prefix.rstrip() else
                          comment_prefix.rstrip().rindex('\n') + 1)
                comment_column = (
                    len(comment_prefix[rindex:]) - len(
                        comment_prefix[rindex:].lstrip()))
                comments = _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False)
                pytree_utils.InsertNodesBefore(comments, child)
                break

        prev_leaf[0] = child

  _VisitNodeRec(tree)
| |
| |
def _CreateCommentsFromPrefix(comment_prefix,
                              comment_lineno,
                              comment_column,
                              standalone=False):
  """Create pytree nodes to represent the given comment prefix.

  The prefix may consist of multiple comment blocks, separated by lines that
  are not comments (usually blank lines). Each block gets its own leaf.

  Args:
    comment_prefix: (unicode) the text of the comment from the node's prefix.
    comment_lineno: (int) the line number for the start of the comment.
    comment_column: (int) the column for the start of the comment.
    standalone: (bool) determines if the comment is standalone or not.

  Returns:
    A list with one entry per comment block: simple_stmt nodes wrapping a
    COMMENT leaf if this is a standalone comment, otherwise the bare COMMENT
    leafs. The list is empty if the prefix holds no comment lines.
  """
  comments = []

  lines = comment_prefix.split('\n')
  index = 0
  while index < len(lines):
    # Gather a run of consecutive comment lines into a single block. Lines are
    # stored without their surrounding whitespace.
    comment_block = []
    while index < len(lines) and lines[index].lstrip().startswith('#'):
      comment_block.append(lines[index].strip())
      index += 1

    if comment_block:
      # index is now one past the block's last line, so the leaf is stamped
      # with the line number of the last line of the block.
      new_lineno = comment_lineno + index - 1
      comment_leaf = pytree.Leaf(
          type=token.COMMENT,
          value='\n'.join(comment_block),
          context=('', (new_lineno, comment_column)))
      comment_node = comment_leaf if not standalone else pytree.Node(
          pygram.python_symbols.simple_stmt, [comment_leaf])
      comments.append(comment_node)

    # Skip ahead to the next comment line. Besides blank lines this also steps
    # over any other non-comment line (e.g. a lone '\' continuation); without
    # that, such a line would never be consumed and the loop would not end.
    while index < len(lines) and not lines[index].lstrip().startswith('#'):
      index += 1

  return comments
| |
| |
| # "Standalone line nodes" are tree nodes that have to start a new line in Python |
| # code (and cannot follow a ';' or ':'). Other nodes, like 'expr_stmt', serve as |
| # parents of other nodes but can come later in a line. This is a list of |
| # standalone line nodes in the grammar. It is meant to be exhaustive |
| # *eventually*, and we'll modify it with time as we discover more corner cases |
| # in the parse tree. |
| # |
| # When splicing a standalone comment (i.e. a comment that appears on its own |
| # line, not on the same line with other code), it's important to insert it into |
| # an appropriate parent of the node it's attached to. An appropriate parent |
| # is the first "standalone line node" in the parent chain of a node. |
_STANDALONE_LINE_NODES = frozenset((
    'suite',
    'if_stmt',
    'while_stmt',
    'for_stmt',
    'try_stmt',
    'with_stmt',
    'funcdef',
    'classdef',
    'decorated',
    'file_input',
))
| |
| |
def _FindNodeWithStandaloneLineParent(node):
  """Locate the nearest node whose parent is a 'standalone line' node.

  The search starts at node itself and climbs through its ancestors. See the
  comment above _STANDALONE_LINE_NODES for more details.

  Arguments:
    node: node to start from

  Returns:
    The first node on the path from node towards the root (node included)
    whose parent's name is listed in _STANDALONE_LINE_NODES.
  """
  candidate = node
  # 'file_input' is listed in _STANDALONE_LINE_NODES; the climb relies on such
  # a node being reached (the root of a pytree is expected to be one).
  while pytree_utils.NodeName(candidate.parent) not in _STANDALONE_LINE_NODES:
    candidate = candidate.parent
  return candidate
| |
| |
| # "Statement nodes" are standalone statements. They don't have to start a new |
| # line. |
_STATEMENT_NODES = _STANDALONE_LINE_NODES.union(('simple_stmt',))
| |
| |
def _FindStmtParent(node):
  """Return the closest statement node at or above node.

  Arguments:
    node: node to start from

  Returns:
    node itself if its name is listed in _STATEMENT_NODES, otherwise its
    nearest ancestor whose name is.
  """
  current = node
  # Walk up the parent chain until a statement node is reached.
  while pytree_utils.NodeName(current) not in _STATEMENT_NODES:
    current = current.parent
  return current
| |
| |
def _FindAncestorAtIndent(node, indent):
  """Find node, or an ancestor of it, that sits at the given indentation.

  Arguments:
    node: node to start from. This must not be the tree root.
    indent: indentation string for the ancestor we're looking for.
      See _AnnotateIndents for more details.

  Returns:
    The first node on the path from node towards the root whose parent has a
    child-indent annotation that indent starts with. If there is none, the
    node on that path that is a direct child of the tree root is returned.
  """
  current = node
  while True:
    if current.parent.parent is None:
      # The parent is the tree root, so there is nowhere further up to look.
      return current

    # A match means the parent's child indentation is a prefix of the
    # requested indent, not necessarily equal to it. Comments may be
    # improperly indented (i.e. by three spaces, where surrounding statements
    # have either zero or two or four), and we don't want to propagate them
    # all the way to the root.
    indent_of_siblings = pytree_utils.GetNodeAnnotation(
        current.parent, pytree_utils.Annotation.CHILD_INDENT)
    if indent_of_siblings is not None and indent.startswith(indent_of_siblings):
      return current

    # No match at this level: keep looking up the tree.
    current = current.parent
| |
| |
def _AnnotateIndents(tree):
  """Record, on every node that has an INDENT child, its children's indent.

  The value is stored as a child_indent annotation: the indentation string
  (taken from the value of the INDENT child) that the node's children use.
  The root of the tree is annotated with the empty string.

  Arguments:
    tree: root of a pytree. The pytree is modified to add annotations to nodes.

  Raises:
    RuntimeError: if the tree is malformed, i.e. an INDENT child disagrees
      with an indentation already recorded on its parent.
  """
  if tree.parent is None:
    # The root of the tree gets zero indent.
    pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
                                   '')

  for subtree in tree.children:
    if subtree.type == token.INDENT:
      recorded = pytree_utils.GetNodeAnnotation(
          tree, pytree_utils.Annotation.CHILD_INDENT)
      # An earlier annotation on this node must agree with the INDENT token.
      if not (recorded is None or recorded == subtree.value):
        raise RuntimeError('inconsistent indentation for child',
                           (tree, subtree))
      pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
                                     subtree.value)
    # Descend into every child, INDENT leaves included.
    _AnnotateIndents(subtree)