1 //===--- FormatToken.h - Format C++ code ------------------------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief This file contains the declaration of the FormatToken, a wrapper 12 /// around Token with additional information related to formatting. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKEN_H 17 #define LLVM_CLANG_LIB_FORMAT_FORMATTOKEN_H 18 19 #include "clang/Basic/IdentifierTable.h" 20 #include "clang/Basic/OperatorPrecedence.h" 21 #include "clang/Format/Format.h" 22 #include "clang/Lex/Lexer.h" 23 #include <memory> 24 25 namespace clang { 26 namespace format { 27 28 enum TokenType { 29 TT_ArrayInitializerLSquare, 30 TT_ArraySubscriptLSquare, 31 TT_AttributeParen, 32 TT_BinaryOperator, 33 TT_BitFieldColon, 34 TT_BlockComment, 35 TT_CastRParen, 36 TT_ConditionalExpr, 37 TT_ConflictAlternative, 38 TT_ConflictEnd, 39 TT_ConflictStart, 40 TT_CtorInitializerColon, 41 TT_CtorInitializerComma, 42 TT_DesignatedInitializerPeriod, 43 TT_DictLiteral, 44 TT_FunctionDeclarationName, 45 TT_FunctionLBrace, 46 TT_FunctionTypeLParen, 47 TT_ImplicitStringLiteral, 48 TT_InheritanceColon, 49 TT_InlineASMColon, 50 TT_JavaAnnotation, 51 TT_JsTypeColon, 52 TT_JsTypeOptionalQuestion, 53 TT_LambdaArrow, 54 TT_LambdaLSquare, 55 TT_LeadingJavaAnnotation, 56 TT_LineComment, 57 TT_ObjCBlockLBrace, 58 TT_ObjCBlockLParen, 59 TT_ObjCDecl, 60 TT_ObjCForIn, 61 TT_ObjCMethodExpr, 62 TT_ObjCMethodSpecifier, 63 TT_ObjCProperty, 64 TT_OverloadedOperator, 65 TT_OverloadedOperatorLParen, 66 TT_PointerOrReference, 67 TT_PureVirtualSpecifier, 68 TT_RangeBasedForLoopColon, 69 TT_RegexLiteral, 70 TT_SelectorName, 71 TT_StartOfName, 72 TT_TemplateCloser, 73 TT_TemplateOpener, 74 TT_TemplateString, 75 TT_TrailingAnnotation, 76 TT_TrailingReturnArrow, 77 TT_TrailingUnaryOperator, 78 TT_UnaryOperator, 79 TT_Unknown 80 }; 81 82 // Represents what type of block a set of braces open. 83 enum BraceBlockKind { 84 BK_Unknown, 85 BK_Block, 86 BK_BracedInit 87 }; 88 89 // The packing kind of a function's parameters. 90 enum ParameterPackingKind { 91 PPK_BinPacked, 92 PPK_OnePerLine, 93 PPK_Inconclusive 94 }; 95 96 enum FormatDecision { 97 FD_Unformatted, 98 FD_Continue, 99 FD_Break 100 }; 101 102 class TokenRole; 103 class AnnotatedLine; 104 105 /// \brief A wrapper around a \c Token storing information about the 106 /// whitespace characters preceding it. 107 struct FormatToken { FormatTokenFormatToken108 FormatToken() 109 : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0), 110 ColumnWidth(0), LastLineColumnWidth(0), IsMultiline(false), 111 IsFirst(false), MustBreakBefore(false), IsUnterminatedLiteral(false), 112 BlockKind(BK_Unknown), Type(TT_Unknown), SpacesRequiredBefore(0), 113 CanBreakBefore(false), ClosesTemplateDeclaration(false), 114 ParameterCount(0), BlockParameterCount(0), 115 PackingKind(PPK_Inconclusive), TotalLength(0), UnbreakableTailLength(0), 116 BindingStrength(0), NestingLevel(0), SplitPenalty(0), 117 LongestObjCSelectorName(0), FakeRParens(0), 118 StartsBinaryExpression(false), EndsBinaryExpression(false), 119 OperatorIndex(0), LastOperator(false), 120 PartOfMultiVariableDeclStmt(false), IsForEachMacro(false), 121 MatchingParen(nullptr), Previous(nullptr), Next(nullptr), 122 Decision(FD_Unformatted), Finalized(false) {} 123 124 /// \brief The \c Token. 125 Token Tok; 126 127 /// \brief The number of newlines immediately before the \c Token. 128 /// 129 /// This can be used to determine what the user wrote in the original code 130 /// and thereby e.g. leave an empty line between two function definitions. 131 unsigned NewlinesBefore; 132 133 /// \brief Whether there is at least one unescaped newline before the \c 134 /// Token. 135 bool HasUnescapedNewline; 136 137 /// \brief The range of the whitespace immediately preceding the \c Token. 138 SourceRange WhitespaceRange; 139 140 /// \brief The offset just past the last '\n' in this token's leading 141 /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'. 142 unsigned LastNewlineOffset; 143 144 /// \brief The width of the non-whitespace parts of the token (or its first 145 /// line for multi-line tokens) in columns. 146 /// We need this to correctly measure number of columns a token spans. 147 unsigned ColumnWidth; 148 149 /// \brief Contains the width in columns of the last line of a multi-line 150 /// token. 151 unsigned LastLineColumnWidth; 152 153 /// \brief Whether the token text contains newlines (escaped or not). 154 bool IsMultiline; 155 156 /// \brief Indicates that this is the first token. 157 bool IsFirst; 158 159 /// \brief Whether there must be a line break before this token. 160 /// 161 /// This happens for example when a preprocessor directive ended directly 162 /// before the token. 163 bool MustBreakBefore; 164 165 /// \brief Returns actual token start location without leading escaped 166 /// newlines and whitespace. 167 /// 168 /// This can be different to Tok.getLocation(), which includes leading escaped 169 /// newlines. getStartOfNonWhitespaceFormatToken170 SourceLocation getStartOfNonWhitespace() const { 171 return WhitespaceRange.getEnd(); 172 } 173 174 /// \brief The raw text of the token. 175 /// 176 /// Contains the raw token text without leading whitespace and without leading 177 /// escaped newlines. 178 StringRef TokenText; 179 180 /// \brief Set to \c true if this token is an unterminated literal. 181 bool IsUnterminatedLiteral; 182 183 /// \brief Contains the kind of block if this token is a brace. 184 BraceBlockKind BlockKind; 185 186 TokenType Type; 187 188 /// \brief The number of spaces that should be inserted before this token. 189 unsigned SpacesRequiredBefore; 190 191 /// \brief \c true if it is allowed to break before this token. 192 bool CanBreakBefore; 193 194 bool ClosesTemplateDeclaration; 195 196 /// \brief Number of parameters, if this is "(", "[" or "<". 197 /// 198 /// This is initialized to 1 as we don't need to distinguish functions with 199 /// 0 parameters from functions with 1 parameter. Thus, we can simply count 200 /// the number of commas. 201 unsigned ParameterCount; 202 203 /// \brief Number of parameters that are nested blocks, 204 /// if this is "(", "[" or "<". 205 unsigned BlockParameterCount; 206 207 /// \brief A token can have a special role that can carry extra information 208 /// about the token's formatting. 209 std::unique_ptr<TokenRole> Role; 210 211 /// \brief If this is an opening parenthesis, how are the parameters packed? 212 ParameterPackingKind PackingKind; 213 214 /// \brief The total length of the unwrapped line up to and including this 215 /// token. 216 unsigned TotalLength; 217 218 /// \brief The original 0-based column of this token, including expanded tabs. 219 /// The configured TabWidth is used as tab width. 220 unsigned OriginalColumn; 221 222 /// \brief The length of following tokens until the next natural split point, 223 /// or the next token that can be broken. 224 unsigned UnbreakableTailLength; 225 226 // FIXME: Come up with a 'cleaner' concept. 227 /// \brief The binding strength of a token. This is a combined value of 228 /// operator precedence, parenthesis nesting, etc. 229 unsigned BindingStrength; 230 231 /// \brief The nesting level of this token, i.e. the number of surrounding (), 232 /// [], {} or <>. 233 unsigned NestingLevel; 234 235 /// \brief Penalty for inserting a line break before this token. 236 unsigned SplitPenalty; 237 238 /// \brief If this is the first ObjC selector name in an ObjC method 239 /// definition or call, this contains the length of the longest name. 240 /// 241 /// This being set to 0 means that the selectors should not be colon-aligned, 242 /// e.g. because several of them are block-type. 243 unsigned LongestObjCSelectorName; 244 245 /// \brief Stores the number of required fake parentheses and the 246 /// corresponding operator precedence. 247 /// 248 /// If multiple fake parentheses start at a token, this vector stores them in 249 /// reverse order, i.e. inner fake parenthesis first. 250 SmallVector<prec::Level, 4> FakeLParens; 251 /// \brief Insert this many fake ) after this token for correct indentation. 252 unsigned FakeRParens; 253 254 /// \brief \c true if this token starts a binary expression, i.e. has at least 255 /// one fake l_paren with a precedence greater than prec::Unknown. 256 bool StartsBinaryExpression; 257 /// \brief \c true if this token ends a binary expression. 258 bool EndsBinaryExpression; 259 260 /// \brief Is this is an operator (or "."/"->") in a sequence of operators 261 /// with the same precedence, contains the 0-based operator index. 262 unsigned OperatorIndex; 263 264 /// \brief Is this the last operator (or "."/"->") in a sequence of operators 265 /// with the same precedence? 266 bool LastOperator; 267 268 /// \brief Is this token part of a \c DeclStmt defining multiple variables? 269 /// 270 /// Only set if \c Type == \c TT_StartOfName. 271 bool PartOfMultiVariableDeclStmt; 272 273 /// \brief Is this a foreach macro? 274 bool IsForEachMacro; 275 isFormatToken276 bool is(tok::TokenKind Kind) const { return Tok.is(Kind); } isFormatToken277 bool is(TokenType TT) const { return Type == TT; } isFormatToken278 bool is(const IdentifierInfo *II) const { 279 return II && II == Tok.getIdentifierInfo(); 280 } isOneOfFormatToken281 template <typename A, typename B> bool isOneOf(A K1, B K2) const { 282 return is(K1) || is(K2); 283 } 284 template <typename A, typename B, typename... Ts> isOneOfFormatToken285 bool isOneOf(A K1, B K2, Ts... Ks) const { 286 return is(K1) || isOneOf(K2, Ks...); 287 } isNotFormatToken288 template <typename T> bool isNot(T Kind) const { return !is(Kind); } 289 isStringLiteralFormatToken290 bool isStringLiteral() const { return tok::isStringLiteral(Tok.getKind()); } 291 isObjCAtKeywordFormatToken292 bool isObjCAtKeyword(tok::ObjCKeywordKind Kind) const { 293 return Tok.isObjCAtKeyword(Kind); 294 } 295 296 bool isAccessSpecifier(bool ColonRequired = true) const { 297 return isOneOf(tok::kw_public, tok::kw_protected, tok::kw_private) && 298 (!ColonRequired || (Next && Next->is(tok::colon))); 299 } 300 301 /// \brief Determine whether the token is a simple-type-specifier. 302 bool isSimpleTypeSpecifier() const; 303 isObjCAccessSpecifierFormatToken304 bool isObjCAccessSpecifier() const { 305 return is(tok::at) && Next && (Next->isObjCAtKeyword(tok::objc_public) || 306 Next->isObjCAtKeyword(tok::objc_protected) || 307 Next->isObjCAtKeyword(tok::objc_package) || 308 Next->isObjCAtKeyword(tok::objc_private)); 309 } 310 311 /// \brief Returns whether \p Tok is ([{ or a template opening <. opensScopeFormatToken312 bool opensScope() const { 313 return isOneOf(tok::l_paren, tok::l_brace, tok::l_square, 314 TT_TemplateOpener); 315 } 316 /// \brief Returns whether \p Tok is )]} or a template closing >. closesScopeFormatToken317 bool closesScope() const { 318 return isOneOf(tok::r_paren, tok::r_brace, tok::r_square, 319 TT_TemplateCloser); 320 } 321 322 /// \brief Returns \c true if this is a "." or "->" accessing a member. isMemberAccessFormatToken323 bool isMemberAccess() const { 324 return isOneOf(tok::arrow, tok::period, tok::arrowstar) && 325 !isOneOf(TT_DesignatedInitializerPeriod, TT_TrailingReturnArrow, 326 TT_LambdaArrow); 327 } 328 isUnaryOperatorFormatToken329 bool isUnaryOperator() const { 330 switch (Tok.getKind()) { 331 case tok::plus: 332 case tok::plusplus: 333 case tok::minus: 334 case tok::minusminus: 335 case tok::exclaim: 336 case tok::tilde: 337 case tok::kw_sizeof: 338 case tok::kw_alignof: 339 return true; 340 default: 341 return false; 342 } 343 } 344 isBinaryOperatorFormatToken345 bool isBinaryOperator() const { 346 // Comma is a binary operator, but does not behave as such wrt. formatting. 347 return getPrecedence() > prec::Comma; 348 } 349 isTrailingCommentFormatToken350 bool isTrailingComment() const { 351 return is(tok::comment) && 352 (is(TT_LineComment) || !Next || Next->NewlinesBefore > 0); 353 } 354 355 /// \brief Returns \c true if this is a keyword that can be used 356 /// like a function call (e.g. sizeof, typeid, ...). isFunctionLikeKeywordFormatToken357 bool isFunctionLikeKeyword() const { 358 switch (Tok.getKind()) { 359 case tok::kw_throw: 360 case tok::kw_typeid: 361 case tok::kw_return: 362 case tok::kw_sizeof: 363 case tok::kw_alignof: 364 case tok::kw_alignas: 365 case tok::kw_decltype: 366 case tok::kw_noexcept: 367 case tok::kw_static_assert: 368 case tok::kw___attribute: 369 return true; 370 default: 371 return false; 372 } 373 } 374 getPrecedenceFormatToken375 prec::Level getPrecedence() const { 376 return getBinOpPrecedence(Tok.getKind(), true, true); 377 } 378 379 /// \brief Returns the previous token ignoring comments. getPreviousNonCommentFormatToken380 FormatToken *getPreviousNonComment() const { 381 FormatToken *Tok = Previous; 382 while (Tok && Tok->is(tok::comment)) 383 Tok = Tok->Previous; 384 return Tok; 385 } 386 387 /// \brief Returns the next token ignoring comments. getNextNonCommentFormatToken388 const FormatToken *getNextNonComment() const { 389 const FormatToken *Tok = Next; 390 while (Tok && Tok->is(tok::comment)) 391 Tok = Tok->Next; 392 return Tok; 393 } 394 395 /// \brief Returns \c true if this tokens starts a block-type list, i.e. a 396 /// list that should be indented with a block indent. opensBlockTypeListFormatToken397 bool opensBlockTypeList(const FormatStyle &Style) const { 398 return is(TT_ArrayInitializerLSquare) || 399 (is(tok::l_brace) && 400 (BlockKind == BK_Block || is(TT_DictLiteral) || 401 (!Style.Cpp11BracedListStyle && NestingLevel == 0))); 402 } 403 404 /// \brief Same as opensBlockTypeList, but for the closing token. closesBlockTypeListFormatToken405 bool closesBlockTypeList(const FormatStyle &Style) const { 406 return MatchingParen && MatchingParen->opensBlockTypeList(Style); 407 } 408 409 FormatToken *MatchingParen; 410 411 FormatToken *Previous; 412 FormatToken *Next; 413 414 SmallVector<AnnotatedLine *, 1> Children; 415 416 /// \brief Stores the formatting decision for the token once it was made. 417 FormatDecision Decision; 418 419 /// \brief If \c true, this token has been fully formatted (indented and 420 /// potentially re-formatted inside), and we do not allow further formatting 421 /// changes. 422 bool Finalized; 423 424 private: 425 // Disallow copying. 426 FormatToken(const FormatToken &) = delete; 427 void operator=(const FormatToken &) = delete; 428 }; 429 430 class ContinuationIndenter; 431 struct LineState; 432 433 class TokenRole { 434 public: TokenRole(const FormatStyle & Style)435 TokenRole(const FormatStyle &Style) : Style(Style) {} 436 virtual ~TokenRole(); 437 438 /// \brief After the \c TokenAnnotator has finished annotating all the tokens, 439 /// this function precomputes required information for formatting. 440 virtual void precomputeFormattingInfos(const FormatToken *Token); 441 442 /// \brief Apply the special formatting that the given role demands. 443 /// 444 /// Assumes that the token having this role is already formatted. 445 /// 446 /// Continues formatting from \p State leaving indentation to \p Indenter and 447 /// returns the total penalty that this formatting incurs. formatFromToken(LineState & State,ContinuationIndenter * Indenter,bool DryRun)448 virtual unsigned formatFromToken(LineState &State, 449 ContinuationIndenter *Indenter, 450 bool DryRun) { 451 return 0; 452 } 453 454 /// \brief Same as \c formatFromToken, but assumes that the first token has 455 /// already been set thereby deciding on the first line break. formatAfterToken(LineState & State,ContinuationIndenter * Indenter,bool DryRun)456 virtual unsigned formatAfterToken(LineState &State, 457 ContinuationIndenter *Indenter, 458 bool DryRun) { 459 return 0; 460 } 461 462 /// \brief Notifies the \c Role that a comma was found. CommaFound(const FormatToken * Token)463 virtual void CommaFound(const FormatToken *Token) {} 464 465 protected: 466 const FormatStyle &Style; 467 }; 468 469 class CommaSeparatedList : public TokenRole { 470 public: CommaSeparatedList(const FormatStyle & Style)471 CommaSeparatedList(const FormatStyle &Style) 472 : TokenRole(Style), HasNestedBracedList(false) {} 473 474 void precomputeFormattingInfos(const FormatToken *Token) override; 475 476 unsigned formatAfterToken(LineState &State, ContinuationIndenter *Indenter, 477 bool DryRun) override; 478 479 unsigned formatFromToken(LineState &State, ContinuationIndenter *Indenter, 480 bool DryRun) override; 481 482 /// \brief Adds \p Token as the next comma to the \c CommaSeparated list. CommaFound(const FormatToken * Token)483 void CommaFound(const FormatToken *Token) override { 484 Commas.push_back(Token); 485 } 486 487 private: 488 /// \brief A struct that holds information on how to format a given list with 489 /// a specific number of columns. 490 struct ColumnFormat { 491 /// \brief The number of columns to use. 492 unsigned Columns; 493 494 /// \brief The total width in characters. 495 unsigned TotalWidth; 496 497 /// \brief The number of lines required for this format. 498 unsigned LineCount; 499 500 /// \brief The size of each column in characters. 501 SmallVector<unsigned, 8> ColumnSizes; 502 }; 503 504 /// \brief Calculate which \c ColumnFormat fits best into 505 /// \p RemainingCharacters. 506 const ColumnFormat *getColumnFormat(unsigned RemainingCharacters) const; 507 508 /// \brief The ordered \c FormatTokens making up the commas of this list. 509 SmallVector<const FormatToken *, 8> Commas; 510 511 /// \brief The length of each of the list's items in characters including the 512 /// trailing comma. 513 SmallVector<unsigned, 8> ItemLengths; 514 515 /// \brief Precomputed formats that can be used for this list. 516 SmallVector<ColumnFormat, 4> Formats; 517 518 bool HasNestedBracedList; 519 }; 520 521 /// \brief Encapsulates keywords that are context sensitive or for languages not 522 /// properly supported by Clang's lexer. 523 struct AdditionalKeywords { AdditionalKeywordsAdditionalKeywords524 AdditionalKeywords(IdentifierTable &IdentTable) { 525 kw_in = &IdentTable.get("in"); 526 kw_CF_ENUM = &IdentTable.get("CF_ENUM"); 527 kw_CF_OPTIONS = &IdentTable.get("CF_OPTIONS"); 528 kw_NS_ENUM = &IdentTable.get("NS_ENUM"); 529 kw_NS_OPTIONS = &IdentTable.get("NS_OPTIONS"); 530 531 kw_finally = &IdentTable.get("finally"); 532 kw_function = &IdentTable.get("function"); 533 kw_import = &IdentTable.get("import"); 534 kw_var = &IdentTable.get("var"); 535 536 kw_abstract = &IdentTable.get("abstract"); 537 kw_extends = &IdentTable.get("extends"); 538 kw_final = &IdentTable.get("final"); 539 kw_implements = &IdentTable.get("implements"); 540 kw_instanceof = &IdentTable.get("instanceof"); 541 kw_interface = &IdentTable.get("interface"); 542 kw_native = &IdentTable.get("native"); 543 kw_package = &IdentTable.get("package"); 544 kw_synchronized = &IdentTable.get("synchronized"); 545 kw_throws = &IdentTable.get("throws"); 546 kw___except = &IdentTable.get("__except"); 547 548 kw_option = &IdentTable.get("option"); 549 kw_optional = &IdentTable.get("optional"); 550 kw_repeated = &IdentTable.get("repeated"); 551 kw_required = &IdentTable.get("required"); 552 kw_returns = &IdentTable.get("returns"); 553 554 kw_signals = &IdentTable.get("signals"); 555 kw_slots = &IdentTable.get("slots"); 556 kw_qslots = &IdentTable.get("Q_SLOTS"); 557 } 558 559 // Context sensitive keywords. 560 IdentifierInfo *kw_in; 561 IdentifierInfo *kw_CF_ENUM; 562 IdentifierInfo *kw_CF_OPTIONS; 563 IdentifierInfo *kw_NS_ENUM; 564 IdentifierInfo *kw_NS_OPTIONS; 565 IdentifierInfo *kw___except; 566 567 // JavaScript keywords. 568 IdentifierInfo *kw_finally; 569 IdentifierInfo *kw_function; 570 IdentifierInfo *kw_import; 571 IdentifierInfo *kw_var; 572 573 // Java keywords. 574 IdentifierInfo *kw_abstract; 575 IdentifierInfo *kw_extends; 576 IdentifierInfo *kw_final; 577 IdentifierInfo *kw_implements; 578 IdentifierInfo *kw_instanceof; 579 IdentifierInfo *kw_interface; 580 IdentifierInfo *kw_native; 581 IdentifierInfo *kw_package; 582 IdentifierInfo *kw_synchronized; 583 IdentifierInfo *kw_throws; 584 585 // Proto keywords. 586 IdentifierInfo *kw_option; 587 IdentifierInfo *kw_optional; 588 IdentifierInfo *kw_repeated; 589 IdentifierInfo *kw_required; 590 IdentifierInfo *kw_returns; 591 592 // QT keywords. 593 IdentifierInfo *kw_signals; 594 IdentifierInfo *kw_slots; 595 IdentifierInfo *kw_qslots; 596 }; 597 598 } // namespace format 599 } // namespace clang 600 601 #endif 602