1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file defines classes for searching and analyzing source code clones. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H 15 #define LLVM_CLANG_AST_CLONEDETECTION_H 16 17 #include "clang/AST/StmtVisitor.h" 18 #include "llvm/Support/Regex.h" 19 #include <vector> 20 21 namespace clang { 22 23 class Stmt; 24 class Decl; 25 class VarDecl; 26 class ASTContext; 27 class CompoundStmt; 28 29 /// Identifies a list of statements. 30 /// 31 /// Can either identify a single arbitrary Stmt object, a continuous sequence of 32 /// child statements inside a CompoundStmt or no statements at all. 33 class StmtSequence { 34 /// If this object identifies a sequence of statements inside a CompoundStmt, 35 /// S points to this CompoundStmt. If this object only identifies a single 36 /// Stmt, then S is a pointer to this Stmt. 37 const Stmt *S; 38 39 /// The declaration that contains the statements. 40 const Decl *D; 41 42 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence 43 /// instance is representing the CompoundStmt children inside the array 44 /// [StartIndex, EndIndex). 45 unsigned StartIndex; 46 unsigned EndIndex; 47 48 public: 49 /// Constructs a StmtSequence holding multiple statements. 50 /// 51 /// The resulting StmtSequence identifies a continuous sequence of statements 52 /// in the body of the given CompoundStmt. Which statements of the body should 53 /// be identified needs to be specified by providing a start and end index 54 /// that describe a non-empty sub-array in the body of the given CompoundStmt. 55 /// 56 /// \param Stmt A CompoundStmt that contains all statements in its body. 57 /// \param D The Decl containing this Stmt. 58 /// \param StartIndex The inclusive start index in the children array of 59 /// \p Stmt 60 /// \param EndIndex The exclusive end index in the children array of \p Stmt. 61 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, 62 unsigned EndIndex); 63 64 /// Constructs a StmtSequence holding a single statement. 65 /// 66 /// \param Stmt An arbitrary Stmt. 67 /// \param D The Decl containing this Stmt. 68 StmtSequence(const Stmt *Stmt, const Decl *D); 69 70 /// Constructs an empty StmtSequence. 71 StmtSequence(); 72 73 typedef const Stmt *const *iterator; 74 75 /// Returns an iterator pointing to the first statement in this sequence. 76 iterator begin() const; 77 78 /// Returns an iterator pointing behind the last statement in this sequence. 79 iterator end() const; 80 81 /// Returns the first statement in this sequence. 82 /// 83 /// This method should only be called on a non-empty StmtSequence object. front()84 const Stmt *front() const { 85 assert(!empty()); 86 return begin()[0]; 87 } 88 89 /// Returns the last statement in this sequence. 90 /// 91 /// This method should only be called on a non-empty StmtSequence object. back()92 const Stmt *back() const { 93 assert(!empty()); 94 return begin()[size() - 1]; 95 } 96 97 /// Returns the number of statements this object holds. size()98 unsigned size() const { 99 if (holdsSequence()) 100 return EndIndex - StartIndex; 101 if (S == nullptr) 102 return 0; 103 return 1; 104 } 105 106 /// Returns true if and only if this StmtSequence contains no statements. empty()107 bool empty() const { return size() == 0; } 108 109 /// Returns the related ASTContext for the stored Stmts. 110 ASTContext &getASTContext() const; 111 112 /// Returns the declaration that contains the stored Stmts. getContainingDecl()113 const Decl *getContainingDecl() const { 114 assert(D); 115 return D; 116 } 117 118 /// Returns true if this objects holds a list of statements. holdsSequence()119 bool holdsSequence() const { return EndIndex != 0; } 120 121 /// Returns the start sourcelocation of the first statement in this sequence. 122 /// 123 /// This method should only be called on a non-empty StmtSequence object. 124 SourceLocation getBeginLoc() const; 125 126 /// Returns the end sourcelocation of the last statement in this sequence. 127 /// 128 /// This method should only be called on a non-empty StmtSequence object. 129 SourceLocation getEndLoc() const; 130 131 /// Returns the source range of the whole sequence - from the beginning 132 /// of the first statement to the end of the last statement. 133 SourceRange getSourceRange() const; 134 135 bool operator==(const StmtSequence &Other) const { 136 return std::tie(S, StartIndex, EndIndex) == 137 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 138 } 139 140 bool operator!=(const StmtSequence &Other) const { 141 return std::tie(S, StartIndex, EndIndex) != 142 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 143 } 144 145 /// Returns true if and only if this sequence covers a source range that 146 /// contains the source range of the given sequence \p Other. 147 /// 148 /// This method should only be called on a non-empty StmtSequence object 149 /// and passed a non-empty StmtSequence object. 150 bool contains(const StmtSequence &Other) const; 151 }; 152 153 /// Searches for similar subtrees in the AST. 154 /// 155 /// First, this class needs several declarations with statement bodies which 156 /// can be passed via analyzeCodeBody. Afterwards all statements can be 157 /// searched for clones by calling findClones with a given list of constraints 158 /// that should specify the wanted properties of the clones. 159 /// 160 /// The result of findClones can be further constrained with the constrainClones 161 /// method. 162 /// 163 /// This class only searches for clones in executable source code 164 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) 165 /// are not supported. 166 class CloneDetector { 167 168 public: 169 /// A collection of StmtSequences that share an arbitrary property. 170 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup; 171 172 /// Generates and stores search data for all statements in the body of 173 /// the given Decl. 174 void analyzeCodeBody(const Decl *D); 175 176 /// Constrains the given list of clone groups with the given constraint. 177 /// 178 /// The constraint is expected to have a method with the signature 179 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)` 180 /// as this is the interface that the CloneDetector uses for applying the 181 /// constraint. The constraint is supposed to directly modify the passed list 182 /// so that all clones in the list fulfill the specific property this 183 /// constraint ensures. 184 template <typename T> constrainClones(std::vector<CloneGroup> & CloneGroups,T C)185 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) { 186 C.constrain(CloneGroups); 187 } 188 189 /// Constrains the given list of clone groups with the given list of 190 /// constraints. 191 /// 192 /// The constraints are applied in sequence in the order in which they are 193 /// passed to this function. 194 template <typename T1, typename... Ts> constrainClones(std::vector<CloneGroup> & CloneGroups,T1 C,Ts...ConstraintList)195 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C, 196 Ts... ConstraintList) { 197 constrainClones(CloneGroups, C); 198 constrainClones(CloneGroups, ConstraintList...); 199 } 200 201 /// Searches for clones in all previously passed statements. 202 /// \param Result Output parameter to which all created clone groups are 203 /// added. 204 /// \param ConstraintList The constraints that should be applied to the 205 // result. 206 template <typename... Ts> findClones(std::vector<CloneGroup> & Result,Ts...ConstraintList)207 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) { 208 // The initial assumption is that there is only one clone group and every 209 // statement is a clone of the others. This clone group will then be 210 // split up with the help of the constraints. 211 CloneGroup AllClones; 212 AllClones.reserve(Sequences.size()); 213 for (const auto &C : Sequences) { 214 AllClones.push_back(C); 215 } 216 217 Result.push_back(AllClones); 218 219 constrainClones(Result, ConstraintList...); 220 } 221 222 private: 223 CloneGroup Sequences; 224 }; 225 226 /// This class is a utility class that contains utility functions for building 227 /// custom constraints. 228 class CloneConstraint { 229 public: 230 /// Removes all groups by using a filter function. 231 /// \param CloneGroups The list of CloneGroups that is supposed to be 232 /// filtered. 233 /// \param Filter The filter function that should return true for all groups 234 /// that should be removed from the list. filterGroups(std::vector<CloneDetector::CloneGroup> & CloneGroups,llvm::function_ref<bool (const CloneDetector::CloneGroup &)> Filter)235 static void filterGroups( 236 std::vector<CloneDetector::CloneGroup> &CloneGroups, 237 llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) { 238 CloneGroups.erase( 239 std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter), 240 CloneGroups.end()); 241 } 242 243 /// Splits the given CloneGroups until the given Compare function returns true 244 /// for all clones in a single group. 245 /// \param CloneGroups A list of CloneGroups that should be modified. 246 /// \param Compare The comparison function that all clones are supposed to 247 /// pass. Should return true if and only if two clones belong 248 /// to the same CloneGroup. 249 static void splitCloneGroups( 250 std::vector<CloneDetector::CloneGroup> &CloneGroups, 251 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> 252 Compare); 253 }; 254 255 /// This constraint moves clones into clone groups of type II via hashing. 256 /// 257 /// Clones with different hash values are moved into separate clone groups. 258 /// Collisions are possible, and this constraint does nothing to address this 259 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the 260 /// constraint chain, not necessarily immediately, to eliminate hash collisions 261 /// through a more detailed analysis. 262 class RecursiveCloneTypeIIHashConstraint { 263 public: 264 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 265 }; 266 267 /// This constraint moves clones into clone groups of type II by comparing them. 268 /// 269 /// Clones that aren't type II clones are moved into separate clone groups. 270 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone 271 /// group are guaranteed to be be type II clones of each other, but it is too 272 /// slow to efficiently handle large amounts of clones. 273 class RecursiveCloneTypeIIVerifyConstraint { 274 public: 275 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 276 }; 277 278 /// Ensures that every clone has at least the given complexity. 279 /// 280 /// Complexity is here defined as the total amount of children of a statement. 281 /// This constraint assumes the first statement in the group is representative 282 /// for all other statements in the group in terms of complexity. 283 class MinComplexityConstraint { 284 unsigned MinComplexity; 285 286 public: MinComplexityConstraint(unsigned MinComplexity)287 MinComplexityConstraint(unsigned MinComplexity) 288 : MinComplexity(MinComplexity) {} 289 290 /// Calculates the complexity of the given StmtSequence. 291 /// \param Limit The limit of complexity we probe for. After reaching 292 /// this limit during calculation, this method is exiting 293 /// early to improve performance and returns this limit. 294 size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit, 295 const std::string &ParentMacroStack = ""); 296 constrain(std::vector<CloneDetector::CloneGroup> & CloneGroups)297 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 298 CloneConstraint::filterGroups( 299 CloneGroups, [this](const CloneDetector::CloneGroup &A) { 300 if (!A.empty()) 301 return calculateStmtComplexity(A.front(), MinComplexity) < 302 MinComplexity; 303 else 304 return false; 305 }); 306 } 307 }; 308 309 /// Ensures that all clone groups contain at least the given amount of clones. 310 class MinGroupSizeConstraint { 311 unsigned MinGroupSize; 312 313 public: 314 MinGroupSizeConstraint(unsigned MinGroupSize = 2) MinGroupSize(MinGroupSize)315 : MinGroupSize(MinGroupSize) {} 316 constrain(std::vector<CloneDetector::CloneGroup> & CloneGroups)317 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 318 CloneConstraint::filterGroups(CloneGroups, 319 [this](const CloneDetector::CloneGroup &A) { 320 return A.size() < MinGroupSize; 321 }); 322 } 323 }; 324 325 /// Ensures that no clone group fully contains another clone group. 326 struct OnlyLargestCloneConstraint { 327 void constrain(std::vector<CloneDetector::CloneGroup> &Result); 328 }; 329 330 struct FilenamePatternConstraint { 331 StringRef IgnoredFilesPattern; 332 std::shared_ptr<llvm::Regex> IgnoredFilesRegex; 333 FilenamePatternConstraintFilenamePatternConstraint334 FilenamePatternConstraint(StringRef IgnoredFilesPattern) 335 : IgnoredFilesPattern(IgnoredFilesPattern) { 336 IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" + 337 IgnoredFilesPattern.str() + "$)"); 338 } 339 340 bool isAutoGenerated(const CloneDetector::CloneGroup &Group); 341 constrainFilenamePatternConstraint342 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 343 CloneConstraint::filterGroups( 344 CloneGroups, [this](const CloneDetector::CloneGroup &Group) { 345 return isAutoGenerated(Group); 346 }); 347 } 348 }; 349 350 /// Analyzes the pattern of the referenced variables in a statement. 351 class VariablePattern { 352 353 /// Describes an occurrence of a variable reference in a statement. 354 struct VariableOccurence { 355 /// The index of the associated VarDecl in the Variables vector. 356 size_t KindID; 357 /// The statement in the code where the variable was referenced. 358 const Stmt *Mention; 359 VariableOccurenceVariableOccurence360 VariableOccurence(size_t KindID, const Stmt *Mention) 361 : KindID(KindID), Mention(Mention) {} 362 }; 363 364 /// All occurrences of referenced variables in the order of appearance. 365 std::vector<VariableOccurence> Occurences; 366 /// List of referenced variables in the order of appearance. 367 /// Every item in this list is unique. 368 std::vector<const VarDecl *> Variables; 369 370 /// Adds a new variable referenced to this pattern. 371 /// \param VarDecl The declaration of the variable that is referenced. 372 /// \param Mention The SourceRange where this variable is referenced. 373 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention); 374 375 /// Adds each referenced variable from the given statement. 376 void addVariables(const Stmt *S); 377 378 public: 379 /// Creates an VariablePattern object with information about the given 380 /// StmtSequence. VariablePattern(const StmtSequence & Sequence)381 VariablePattern(const StmtSequence &Sequence) { 382 for (const Stmt *S : Sequence) 383 addVariables(S); 384 } 385 386 /// Describes two clones that reference their variables in a different pattern 387 /// which could indicate a programming error. 388 struct SuspiciousClonePair { 389 /// Utility class holding the relevant information about a single 390 /// clone in this pair. 391 struct SuspiciousCloneInfo { 392 /// The variable which referencing in this clone was against the pattern. 393 const VarDecl *Variable; 394 /// Where the variable was referenced. 395 const Stmt *Mention; 396 /// The variable that should have been referenced to follow the pattern. 397 /// If Suggestion is a nullptr then it's not possible to fix the pattern 398 /// by referencing a different variable in this clone. 399 const VarDecl *Suggestion; SuspiciousCloneInfoSuspiciousClonePair::SuspiciousCloneInfo400 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, 401 const VarDecl *Suggestion) 402 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {} SuspiciousCloneInfoSuspiciousClonePair::SuspiciousCloneInfo403 SuspiciousCloneInfo() {} 404 }; 405 /// The first clone in the pair which always has a suggested variable. 406 SuspiciousCloneInfo FirstCloneInfo; 407 /// This other clone in the pair which can have a suggested variable. 408 SuspiciousCloneInfo SecondCloneInfo; 409 }; 410 411 /// Counts the differences between this pattern and the given one. 412 /// \param Other The given VariablePattern to compare with. 413 /// \param FirstMismatch Output parameter that will be filled with information 414 /// about the first difference between the two patterns. This parameter 415 /// can be a nullptr, in which case it will be ignored. 416 /// \return Returns the number of differences between the pattern this object 417 /// is following and the given VariablePattern. 418 /// 419 /// For example, the following statements all have the same pattern and this 420 /// function would return zero: 421 /// 422 /// if (a < b) return a; return b; 423 /// if (x < y) return x; return y; 424 /// if (u2 < u1) return u2; return u1; 425 /// 426 /// But the following statement has a different pattern (note the changed 427 /// variables in the return statements) and would have two differences when 428 /// compared with one of the statements above. 429 /// 430 /// if (a < b) return b; return a; 431 /// 432 /// This function should only be called if the related statements of the given 433 /// pattern and the statements of this objects are clones of each other. 434 unsigned countPatternDifferences( 435 const VariablePattern &Other, 436 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr); 437 }; 438 439 /// Ensures that all clones reference variables in the same pattern. 440 struct MatchingVariablePatternConstraint { 441 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups); 442 }; 443 444 } // end namespace clang 445 446 #endif // LLVM_CLANG_AST_CLONEDETECTION_H 447