1# Copyright 2015 Google Inc. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""UnwrappedLine primitive for formatting.
15
16An unwrapped line is the containing data structure produced by the parser. It
17collects all nodes (stored in FormatToken objects) that could appear on a
18single line if there were no line length restrictions. It's then used by the
19parser to perform the wrapping required to comply with the style guide.
20"""
21
22from yapf.yapflib import format_token
23from yapf.yapflib import py3compat
24from yapf.yapflib import pytree_utils
25from yapf.yapflib import split_penalty
26from yapf.yapflib import style
27
28
class UnwrappedLine(object):
  """A logical line of tokens before any line wrapping is applied.

  Attributes:
    depth: nesting depth of the line. A plain number used to tell more deeply
      nested lines from shallower ones; the real number of spaces per level
      depends on the style.
    disable: flag that starts out False; while it is set, Split() hands back
      the line untouched.
  """

  def __init__(self, depth, tokens=None):
    """Create a line at the given depth, optionally seeded with tokens.

    Neighbouring tokens are chained through their previous_token and
    next_token attributes so that they form a doubly-linked list.

    Arguments:
      depth: indentation depth of this line
      tokens: initial list of tokens; a falsy value starts an empty line
    """
    self.depth = depth
    self._tokens = tokens or []
    self.disable = False

    # Pair every token with its successor and link the two together.
    for before, after in zip(self._tokens, self._tokens[1:]):
      after.previous_token = before
      before.next_token = after

  def CalculateFormattingInformation(self):
    """Fill in spacing, length and line-break data for every token."""
    head = self.first
    # The leading token is marked as wanting one space in front of it. That
    # only matters if this line ends up joined to the line preceding it.
    head.spaces_required_before = 1
    head.total_length = len(head.value)

    left = head
    running_length = head.total_length
    for right in self._tokens[1:]:
      if (right.spaces_required_before == 0 and
          _SpaceRequiredBetween(left, right)):
        right.spaces_required_before = 1

      # Pseudo parens contribute no width of their own.
      width = 0 if right.is_pseudo_paren else len(right.value)
      right.total_length = (
          running_length + width + right.spaces_required_before)

      # Order matters: the penalty is settled first because the two break
      # predicates may consult it.
      right.split_penalty += _SplitPenalty(left, right)
      right.must_break_before = _MustBreakBefore(left, right)
      right.can_break_before = (
          right.must_break_before or _CanBreakBefore(left, right))

      running_length = right.total_length
      left = right

  def Split(self):
    """Break the line into separate lines at each semicolon.

    Returns:
      A list holding just this line when it has no semicolon or is disabled.
      Otherwise a list of new UnwrappedLine objects at the same depth, with
      the semicolon tokens left out and the first token of each one annotated
      as MUST_SPLIT.
    """
    splittable = self.has_semicolon and not self.disable
    if not splittable:
      return [self]

    pieces = []
    current = UnwrappedLine(self.depth)
    for tok in self._tokens:
      if tok.value != ';':
        current.AppendToken(tok)
        continue
      # A semicolon closes the current piece and is itself dropped.
      pieces.append(current)
      current = UnwrappedLine(self.depth)

    # Keep whatever followed the final semicolon, if anything.
    if current.tokens:
      pieces.append(current)

    for piece in pieces:
      pytree_utils.SetNodeAnnotation(piece.first.node,
                                     pytree_utils.Annotation.MUST_SPLIT, True)
      # Detach the piece from the tokens of its former neighbours.
      piece.first.previous_token = None
      piece.last.next_token = None

    return pieces

  ############################################################################
  # Token Access and Manipulation Methods                                    #
  ############################################################################

  def AppendToken(self, token):
    """Add a FormatToken at the end, linking it to the current last token."""
    if self._tokens:
      tail = self.last
      token.previous_token = tail
      tail.next_token = token
    self._tokens.append(token)

  def AppendNode(self, node):
    """Wrap a pytree node in a FormatToken and append it to this line.

    Arguments:
      node: the node to append
    """
    wrapped = format_token.FormatToken(node)
    self.AppendToken(wrapped)

  @property
  def first(self):
    """The first token stored in this line."""
    return self._tokens[0]

  @property
  def last(self):
    """The last token stored in this line."""
    return self._tokens[-1]

  ############################################################################
  # Token -> String Methods                                                  #
  ############################################################################

  def AsCode(self, indent_per_depth=2):
    """Render the line roughly the way it would look as code.

    The indentation is followed by the token values, each separated from the
    next by a single space.

    TODO(eliben): for now this is rudimentary for debugging - once we add
    formatting capabilities, this method will have other uses (not all tokens
    have spaces around them, for example).

    Arguments:
      indent_per_depth: how many spaces to indent per depth level.

    Returns:
      A string representing the line as code.
    """
    leading = ' ' * indent_per_depth * self.depth
    values = [tok.value for tok in self._tokens]
    return leading + ' '.join(values)

  def __str__(self):  # pragma: no cover
    return self.AsCode()

  def __repr__(self):  # pragma: no cover
    described = ','.join(
        '{0}({1!r})'.format(tok.name, tok.value) for tok in self._tokens)
    return 'UnwrappedLine(depth={0}, tokens=[{1}])'.format(
        self.depth, described)

  ############################################################################
  # Properties                                                               #
  ############################################################################

  @property
  def tokens(self):
    """The tokens held by this line.

    This is the internal list itself, not a copy, so the caller must not
    modify it.

    Returns:
      List of tokens in this line.
    """
    return self._tokens

  @property
  def lineno(self):
    """The line number of this unwrapped line.

    Returns:
      The line number of the first token in this unwrapped line.
    """
    return self.first.lineno

  @property
  def is_comment(self):
    """The is_comment value of the first token in this line."""
    return self.first.is_comment

  @property
  def has_semicolon(self):
    """True if any token in this line has the value ';'."""
    for tok in self._tokens:
      if tok.value == ';':
        return True
    return False
207
208
def _IsIdNumberStringToken(tok):
  """Tell whether the token is a keyword, name, number or string."""
  is_word = tok.is_keyword or tok.is_name
  return is_word or tok.is_number or tok.is_string
211
212
def _IsUnaryOperator(tok):
  """Tell whether the token carries the UNARY_OPERATOR subtype."""
  unary = format_token.Subtype.UNARY_OPERATOR
  return unary in tok.subtypes
215
216
def _SpaceRequiredBetween(left, right):
  """Return True if a space is required between the left and right token.

  The checks run top to bottom and the first one that matches decides the
  result, so their order is significant.

  Arguments:
    left: the token on the left-hand side of the gap
    right: the token on the right-hand side of the gap

  Returns:
    True or False for most rules. A few rules defer to the active style and
    return whatever style.Get() yields for the setting named in the check.
    True is returned when no rule matches.
  """
  lval = left.value
  rval = right.value
  if (left.is_pseudo_paren and _IsIdNumberStringToken(right) and
      left.previous_token and _IsIdNumberStringToken(left.previous_token)):
    # Space between keyword... tokens and pseudo parens.
    return True
  if left.is_pseudo_paren or right.is_pseudo_paren:
    # There should be a space after the ':' in a dictionary.
    if left.OpensScope():
      return True
    # The closing pseudo-paren shouldn't affect spacing.
    return False
  if left.is_continuation or right.is_continuation:
    # The continuation node's value has all of the spaces it needs.
    return False
  if right.name in pytree_utils.NONSEMANTIC_TOKENS:
    # No space before a non-semantic token.
    return False
  if _IsIdNumberStringToken(left) and _IsIdNumberStringToken(right):
    # Spaces between keyword, string, number, and identifier tokens.
    return True
  if lval == ',' and rval == ':':
    # We do want a space between a comma and colon.
    return True
  if rval in ':,':
    # Otherwise, we never want a space before a colon or comma.
    # NOTE(review): `in` on a string is a substring test, so an empty value
    # would match here too. The same applies to the other `in '...'` checks
    # in this function.
    return False
  if lval == ',' and rval in ']})':
    # Add a space between ending ',' and closing bracket if requested.
    return style.Get('SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET')
  if lval == ',':
    # We want a space after a comma.
    return True
  if lval == 'from' and rval == '.':
    # Space before the '.' in an import statement.
    return True
  if lval == '.' and rval == 'import':
    # Space after the '.' in an import statement.
    return True
  if (lval == '=' and rval == '.' and
      format_token.Subtype.DEFAULT_OR_NAMED_ASSIGN not in left.subtypes):
    # Space between equal and '.' as in "X = ...".
    return True
  if ((right.is_keyword or right.is_name) and
      (left.is_keyword or left.is_name)):
    # Don't merge two keywords/identifiers.
    # NOTE(review): looks unreachable - the _IsIdNumberStringToken check
    # further up already returns True whenever both tokens are keywords or
    # names.
    return True
  if (format_token.Subtype.SUBSCRIPT_COLON in left.subtypes or
      format_token.Subtype.SUBSCRIPT_COLON in right.subtypes):
    # A subscript shouldn't have spaces separating its colons.
    return False
  if (format_token.Subtype.TYPED_NAME in left.subtypes or
      format_token.Subtype.TYPED_NAME in right.subtypes):
    # A typed argument should have a space after the colon.
    return True
  if left.is_string:
    if (rval == '=' and
        format_token.Subtype.DEFAULT_OR_NAMED_ASSIGN_ARG_LIST in right.subtypes
       ):
      # If there is a type hint, then we don't want to add a space between the
      # equal sign and the hint.
      return False
    if rval not in '[)]}.':
      # A string followed by something other than a subscript, closing bracket,
      # or dot should have a space after it.
      return True
    # Otherwise fall through to the remaining rules.
  if left.is_binary_op and lval != '**' and _IsUnaryOperator(right):
    # Space between the binary operator and the unary operator.
    return True
  if left.is_keyword and _IsUnaryOperator(right):
    # Handle things like "not -3 < x".
    return True
  if _IsUnaryOperator(left) and _IsUnaryOperator(right):
    # No space between two unary operators.
    return False
  if left.is_binary_op or right.is_binary_op:
    if lval == '**' or rval == '**':
      # Space around the "power" operator.
      return style.Get('SPACES_AROUND_POWER_OPERATOR')
    # Enforce spaces around binary operators except the blacklisted ones.
    blacklist = style.Get('NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS')
    return lval not in blacklist and rval not in blacklist
  if (_IsUnaryOperator(left) and lval != 'not' and
      (right.is_name or right.is_number or rval == '(')):
    # The previous token was a unary op. No space is desired between it and
    # the current token.
    return False
  if (format_token.Subtype.DEFAULT_OR_NAMED_ASSIGN in left.subtypes or
      format_token.Subtype.DEFAULT_OR_NAMED_ASSIGN in right.subtypes):
    # Spacing around the '=' of a named argument or default parameter follows
    # the SPACES_AROUND_DEFAULT_OR_NAMED_ASSIGN style setting.
    return style.Get('SPACES_AROUND_DEFAULT_OR_NAMED_ASSIGN')
  if (format_token.Subtype.VARARGS_LIST in left.subtypes or
      format_token.Subtype.VARARGS_LIST in right.subtypes):
    # No space next to a token that carries the VARARGS_LIST subtype.
    return False
  if (format_token.Subtype.VARARGS_STAR in left.subtypes or
      format_token.Subtype.KWARGS_STAR_STAR in left.subtypes):
    # Don't add a space after a vararg's star or a keyword's star-star.
    return False
  if lval == '@' and format_token.Subtype.DECORATOR in left.subtypes:
    # Decorators shouldn't be separated from the 'at' sign.
    return False
  # `and` binds tighter than `or`: (keyword before '.') or ('.' before keyword).
  if left.is_keyword and rval == '.' or lval == '.' and right.is_keyword:
    # Add space between keywords and dots, except when the left token is the
    # literal 'None'.
    return lval != 'None'
  if lval == '.' or rval == '.':
    # Don't place spaces between dots.
    return False
  if ((lval == '(' and rval == ')') or (lval == '[' and rval == ']') or
      (lval == '{' and rval == '}')):
    # Empty objects shouldn't be separated by spaces.
    return False
  if (lval in pytree_utils.OPENING_BRACKETS and
      rval in pytree_utils.OPENING_BRACKETS):
    # Nested objects' opening brackets shouldn't be separated.
    return False
  if (lval in pytree_utils.CLOSING_BRACKETS and
      rval in pytree_utils.CLOSING_BRACKETS):
    # Nested objects' closing brackets shouldn't be separated.
    return False
  if lval in pytree_utils.CLOSING_BRACKETS and rval in '([':
    # A call, set, dictionary, or subscript that has a call or subscript after
    # it shouldn't have a space between them.
    return False
  if lval in pytree_utils.OPENING_BRACKETS and _IsIdNumberStringToken(right):
    # Don't separate the opening bracket from the first item.
    return False
  if left.is_name and rval in '([':
    # Don't separate a call or array access from the name.
    return False
  if rval in pytree_utils.CLOSING_BRACKETS:
    # Don't separate the closing bracket from the last item.
    # FIXME(morbo): This might be too permissive.
    return False
  if lval == 'print' and rval == '(':
    # Special support for the 'print' function.
    return False
  if lval in pytree_utils.OPENING_BRACKETS and _IsUnaryOperator(right):
    # Don't separate a unary operator from the opening bracket.
    return False
  if (lval in pytree_utils.OPENING_BRACKETS and
      (format_token.Subtype.VARARGS_STAR in right.subtypes or
       format_token.Subtype.KWARGS_STAR_STAR in right.subtypes)):
    # Don't separate a '*' or '**' from the opening bracket.
    return False
  if rval == ';':
    # Avoid spaces before a semicolon. (Why is there a semicolon?!)
    return False
  if lval == '(' and rval == 'await':
    # Special support for the 'await' keyword. Don't separate the 'await'
    # keyword from an opening paren.
    return False
  # No rule objected: default to separating the two tokens with a space.
  return True
371
372
def _MustBreakBefore(prev_token, cur_token):
  """Decide whether a line break is mandatory in front of cur_token.

  Arguments:
    prev_token: the token immediately preceding cur_token
    cur_token: the token being examined

  Returns:
    True when the previous token is a comment (or a pseudo paren whose own
    predecessor is a comment), or when two strings sit next to each other
    inside brackets. Otherwise the MUST_SPLIT annotation of the current
    token's node, looked up with a default of False.
  """
  after_comment = prev_token.is_comment
  if not after_comment:
    # A pseudo paren may sit between the comment and the current token.
    earlier = prev_token.previous_token
    after_comment = (
        earlier and prev_token.is_pseudo_paren and earlier.is_comment)
  if after_comment:
    return True

  # Consecutive strings inside brackets each go on a line of their own; had
  # the author wanted them together they could have been written as one
  # string or joined with a '+'.
  adjacent_strings = cur_token.is_string and prev_token.is_string
  if adjacent_strings and IsSurroundedByBrackets(cur_token):
    return True

  return pytree_utils.GetNodeAnnotation(
      cur_token.node, pytree_utils.Annotation.MUST_SPLIT, default=False)
388
389
def _CanBreakBefore(prev_token, cur_token):
  """Decide whether a line break is permitted in front of cur_token.

  Arguments:
    prev_token: the token immediately preceding cur_token
    cur_token: the token being examined

  Returns:
    False if any of the rules below forbids the break, True otherwise.
  """
  before = prev_token.value
  current = cur_token.value
  if py3compat.PY3:
    if before == 'yield' and current == 'from':
      # Keep 'yield from' together.
      return False
    if before in {'async', 'await'} and current in {'def', 'with', 'for'}:
      # Keep 'async'/'await' attached to the keyword that follows.
      return False
  if cur_token.split_penalty >= split_penalty.UNBREAKABLE:
    # The penalty already marks this position as unbreakable.
    return False
  if before == '@' or current in (':', ','):
    # No break right after the '@' of a decorator, nor before a ':' that
    # starts a block of code, nor before a comma.
    return False
  if prev_token.is_name and current in ('(', '[', '.'):
    # No break between a name and the '(' of a call or definition, the '['
    # of a subscript, or the '.' of a dotted name.
    return False
  if cur_token.is_comment and prev_token.lineno == cur_token.lineno:
    # A trailing comment stays on the line it was written on.
    return False
  # Finally, never break right after a unary operator.
  return format_token.Subtype.UNARY_OPERATOR not in prev_token.subtypes
428
429
def IsSurroundedByBrackets(tok):
  """Find the opening bracket that encloses the token, if any.

  Walks backwards through the previous_token links. Closing brackets met on
  the way are paired with their openers so that bracket groups which are
  already complete get skipped.

  Arguments:
    tok: the token to start from; the token itself is not examined

  Returns:
    The closest preceding '(', '{' or '[' token that is still open at tok, or
    None if there is none. The result can be used as a truth value.
  """
  opener_for = {')': '(', '}': '{', ']': '['}
  # Per opener: closing brackets seen so far that still await their opener.
  pending = {'(': 0, '{': 0, '[': 0}

  cursor = tok.previous_token
  while cursor:
    value = cursor.value
    if value in opener_for:
      pending[opener_for[value]] += 1
    elif value in pending:
      if pending[value] == 0:
        # Nothing closed this opener on the way here, so it encloses tok.
        return cursor
      pending[value] -= 1
    cursor = cursor.previous_token
  return None
459
460
# Operator groups that _SplitPenalty looks at when scoring a split position.
_LOGICAL_OPERATORS = frozenset(('and', 'or'))
_BITWISE_OPERATORS = frozenset(('&', '|', '^'))
_TERM_OPERATORS = frozenset(('*', '/', '%', '//'))
464
465
def _SplitPenalty(prev_token, cur_token):
  """Score how undesirable a line break in front of cur_token is.

  The rules are tried in order and the first match supplies the result.

  Arguments:
    prev_token: the token immediately preceding cur_token
    cur_token: the token a break would be placed in front of

  Returns:
    The penalty for breaking the line before the current token; 0 when no
    rule applies.
  """
  before = prev_token.value
  current = cur_token.value
  if before == 'not':
    return split_penalty.UNBREAKABLE

  # A positive penalty recorded on the token itself takes precedence over
  # every rule that follows.
  node_penalty = cur_token.node_split_penalty
  if node_penalty > 0:
    return node_penalty

  # 'and' / 'or': the side of the operator the style prefers to split on is
  # free, the other side costs the configured penalty.
  split_before_logical = style.Get('SPLIT_BEFORE_LOGICAL_OPERATOR')
  if before in _LOGICAL_OPERATORS:
    if split_before_logical:
      return style.Get('SPLIT_PENALTY_LOGICAL_OPERATOR')
    return 0
  if current in _LOGICAL_OPERATORS:
    if split_before_logical:
      return 0
    return style.Get('SPLIT_PENALTY_LOGICAL_OPERATOR')

  # '&', '|' and '^' are treated the same way with their own settings.
  split_before_bitwise = style.Get('SPLIT_BEFORE_BITWISE_OPERATOR')
  if before in _BITWISE_OPERATORS:
    if split_before_bitwise:
      return style.Get('SPLIT_PENALTY_BITWISE_OPERATOR')
    return 0
  if current in _BITWISE_OPERATORS:
    if split_before_bitwise:
      return 0
    return style.Get('SPLIT_PENALTY_BITWISE_OPERATOR')

  subtype = format_token.Subtype
  prev_subtypes = prev_token.subtypes
  cur_subtypes = cur_token.subtypes

  if subtype.COMP_FOR in cur_subtypes or subtype.COMP_IF in cur_subtypes:
    # Breaking before the 'for' or 'if' of a comprehension is fine.
    return 0
  if subtype.UNARY_OPERATOR in prev_subtypes:
    # Try not to break after a unary operator.
    return style.Get('SPLIT_PENALTY_AFTER_UNARY_OPERATOR')
  if before == ',':
    # Breaking after a comma is fine, if need be.
    return 0
  if prev_token.is_binary_op:
    # We would rather not split right after a binary operator.
    return 20
  if (subtype.VARARGS_STAR in prev_subtypes or
      subtype.KWARGS_STAR_STAR in prev_subtypes):
    # Never split after a varargs '*' or a kwargs '**'.
    return split_penalty.UNBREAKABLE
  if prev_token.OpensScope() and current != '(':
    # Splitting right after an opening bracket costs whatever the style
    # configures, unless another '(' follows.
    return style.Get('SPLIT_PENALTY_AFTER_OPENING_BRACKET')
  if current in (':', '='):
    # Never split before a colon or an assignment.
    return split_penalty.UNBREAKABLE
  if (subtype.DEFAULT_OR_NAMED_ASSIGN in prev_subtypes or
      subtype.DEFAULT_OR_NAMED_ASSIGN in cur_subtypes):
    # Never break before or after a default or named assignment.
    return split_penalty.UNBREAKABLE
  if current == '==':
    # We would rather not split before an equality operator.
    return split_penalty.STRONGLY_CONNECTED
  if cur_token.ClosesScope():
    # Give a slight penalty for splitting before the closing scope.
    return 100
  if before in _TERM_OPERATORS or current in _TERM_OPERATORS:
    # Mild penalty for splitting on either side of '*', '/', '%' or '//'.
    return 50
  return 0
541