1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file defines classes for searching and analyzing source code clones.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
15 #define LLVM_CLANG_AST_CLONEDETECTION_H
16 
17 #include "clang/AST/StmtVisitor.h"
18 #include "llvm/Support/Regex.h"
19 #include <vector>
20 
21 namespace clang {
22 
23 class Stmt;
24 class Decl;
25 class VarDecl;
26 class ASTContext;
27 class CompoundStmt;
28 
29 /// Identifies a list of statements.
30 ///
31 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
32 /// child statements inside a CompoundStmt or no statements at all.
33 class StmtSequence {
34   /// If this object identifies a sequence of statements inside a CompoundStmt,
35   /// S points to this CompoundStmt. If this object only identifies a single
36   /// Stmt, then S is a pointer to this Stmt.
37   const Stmt *S;
38 
39   /// The declaration that contains the statements.
40   const Decl *D;
41 
42   /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
43   /// instance is representing the CompoundStmt children inside the array
44   /// [StartIndex, EndIndex).
45   unsigned StartIndex;
46   unsigned EndIndex;
47 
48 public:
49   /// Constructs a StmtSequence holding multiple statements.
50   ///
51   /// The resulting StmtSequence identifies a continuous sequence of statements
52   /// in the body of the given CompoundStmt. Which statements of the body should
53   /// be identified needs to be specified by providing a start and end index
54   /// that describe a non-empty sub-array in the body of the given CompoundStmt.
55   ///
56   /// \param Stmt A CompoundStmt that contains all statements in its body.
57   /// \param D The Decl containing this Stmt.
58   /// \param StartIndex The inclusive start index in the children array of
59   ///                   \p Stmt
60   /// \param EndIndex The exclusive end index in the children array of \p Stmt.
61   StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
62                unsigned EndIndex);
63 
64   /// Constructs a StmtSequence holding a single statement.
65   ///
66   /// \param Stmt An arbitrary Stmt.
67   /// \param D The Decl containing this Stmt.
68   StmtSequence(const Stmt *Stmt, const Decl *D);
69 
70   /// Constructs an empty StmtSequence.
71   StmtSequence();
72 
73   typedef const Stmt *const *iterator;
74 
75   /// Returns an iterator pointing to the first statement in this sequence.
76   iterator begin() const;
77 
78   /// Returns an iterator pointing behind the last statement in this sequence.
79   iterator end() const;
80 
81   /// Returns the first statement in this sequence.
82   ///
83   /// This method should only be called on a non-empty StmtSequence object.
front()84   const Stmt *front() const {
85     assert(!empty());
86     return begin()[0];
87   }
88 
89   /// Returns the last statement in this sequence.
90   ///
91   /// This method should only be called on a non-empty StmtSequence object.
back()92   const Stmt *back() const {
93     assert(!empty());
94     return begin()[size() - 1];
95   }
96 
97   /// Returns the number of statements this object holds.
size()98   unsigned size() const {
99     if (holdsSequence())
100       return EndIndex - StartIndex;
101     if (S == nullptr)
102       return 0;
103     return 1;
104   }
105 
106   /// Returns true if and only if this StmtSequence contains no statements.
empty()107   bool empty() const { return size() == 0; }
108 
109   /// Returns the related ASTContext for the stored Stmts.
110   ASTContext &getASTContext() const;
111 
112   /// Returns the declaration that contains the stored Stmts.
getContainingDecl()113   const Decl *getContainingDecl() const {
114     assert(D);
115     return D;
116   }
117 
118   /// Returns true if this objects holds a list of statements.
holdsSequence()119   bool holdsSequence() const { return EndIndex != 0; }
120 
121   /// Returns the start sourcelocation of the first statement in this sequence.
122   ///
123   /// This method should only be called on a non-empty StmtSequence object.
124   SourceLocation getBeginLoc() const;
125 
126   /// Returns the end sourcelocation of the last statement in this sequence.
127   ///
128   /// This method should only be called on a non-empty StmtSequence object.
129   SourceLocation getEndLoc() const;
130 
131   /// Returns the source range of the whole sequence - from the beginning
132   /// of the first statement to the end of the last statement.
133   SourceRange getSourceRange() const;
134 
135   bool operator==(const StmtSequence &Other) const {
136     return std::tie(S, StartIndex, EndIndex) ==
137            std::tie(Other.S, Other.StartIndex, Other.EndIndex);
138   }
139 
140   bool operator!=(const StmtSequence &Other) const {
141     return std::tie(S, StartIndex, EndIndex) !=
142            std::tie(Other.S, Other.StartIndex, Other.EndIndex);
143   }
144 
145   /// Returns true if and only if this sequence covers a source range that
146   /// contains the source range of the given sequence \p Other.
147   ///
148   /// This method should only be called on a non-empty StmtSequence object
149   /// and passed a non-empty StmtSequence object.
150   bool contains(const StmtSequence &Other) const;
151 };
152 
153 /// Searches for similar subtrees in the AST.
154 ///
155 /// First, this class needs several declarations with statement bodies which
156 /// can be passed via analyzeCodeBody. Afterwards all statements can be
157 /// searched for clones by calling findClones with a given list of constraints
158 /// that should specify the wanted properties of the clones.
159 ///
160 /// The result of findClones can be further constrained with the constrainClones
161 /// method.
162 ///
163 /// This class only searches for clones in executable source code
164 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
165 /// are not supported.
166 class CloneDetector {
167 
168 public:
169   /// A collection of StmtSequences that share an arbitrary property.
170   typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
171 
172   /// Generates and stores search data for all statements in the body of
173   /// the given Decl.
174   void analyzeCodeBody(const Decl *D);
175 
176   /// Constrains the given list of clone groups with the given constraint.
177   ///
178   /// The constraint is expected to have a method with the signature
179   ///     `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
180   /// as this is the interface that the CloneDetector uses for applying the
181   /// constraint. The constraint is supposed to directly modify the passed list
182   /// so that all clones in the list fulfill the specific property this
183   /// constraint ensures.
184   template <typename T>
constrainClones(std::vector<CloneGroup> & CloneGroups,T C)185   static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
186     C.constrain(CloneGroups);
187   }
188 
189   /// Constrains the given list of clone groups with the given list of
190   /// constraints.
191   ///
192   /// The constraints are applied in sequence in the order in which they are
193   /// passed to this function.
194   template <typename T1, typename... Ts>
constrainClones(std::vector<CloneGroup> & CloneGroups,T1 C,Ts...ConstraintList)195   static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
196                               Ts... ConstraintList) {
197     constrainClones(CloneGroups, C);
198     constrainClones(CloneGroups, ConstraintList...);
199   }
200 
201   /// Searches for clones in all previously passed statements.
202   /// \param Result Output parameter to which all created clone groups are
203   ///               added.
204   /// \param ConstraintList The constraints that should be applied to the
205   //         result.
206   template <typename... Ts>
findClones(std::vector<CloneGroup> & Result,Ts...ConstraintList)207   void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
208     // The initial assumption is that there is only one clone group and every
209     // statement is a clone of the others. This clone group will then be
210     // split up with the help of the constraints.
211     CloneGroup AllClones;
212     AllClones.reserve(Sequences.size());
213     for (const auto &C : Sequences) {
214       AllClones.push_back(C);
215     }
216 
217     Result.push_back(AllClones);
218 
219     constrainClones(Result, ConstraintList...);
220   }
221 
222 private:
223   CloneGroup Sequences;
224 };
225 
226 /// This class is a utility class that contains utility functions for building
227 /// custom constraints.
228 class CloneConstraint {
229 public:
230   /// Removes all groups by using a filter function.
231   /// \param CloneGroups The list of CloneGroups that is supposed to be
232   ///                    filtered.
233   /// \param Filter The filter function that should return true for all groups
234   ///               that should be removed from the list.
filterGroups(std::vector<CloneDetector::CloneGroup> & CloneGroups,llvm::function_ref<bool (const CloneDetector::CloneGroup &)> Filter)235   static void filterGroups(
236       std::vector<CloneDetector::CloneGroup> &CloneGroups,
237       llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
238     CloneGroups.erase(
239         std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
240         CloneGroups.end());
241   }
242 
243   /// Splits the given CloneGroups until the given Compare function returns true
244   /// for all clones in a single group.
245   /// \param CloneGroups A list of CloneGroups that should be modified.
246   /// \param Compare The comparison function that all clones are supposed to
247   ///                pass. Should return true if and only if two clones belong
248   ///                to the same CloneGroup.
249   static void splitCloneGroups(
250       std::vector<CloneDetector::CloneGroup> &CloneGroups,
251       llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
252           Compare);
253 };
254 
255 /// This constraint moves clones into clone groups of type II via hashing.
256 ///
257 /// Clones with different hash values are moved into separate clone groups.
258 /// Collisions are possible, and this constraint does nothing to address this
259 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
260 /// constraint chain, not necessarily immediately, to eliminate hash collisions
261 /// through a more detailed analysis.
262 class RecursiveCloneTypeIIHashConstraint {
263 public:
264   void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
265 };
266 
267 /// This constraint moves clones into clone groups of type II by comparing them.
268 ///
269 /// Clones that aren't type II clones are moved into separate clone groups.
270 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
271 /// group are guaranteed to be be type II clones of each other, but it is too
272 /// slow to efficiently handle large amounts of clones.
273 class RecursiveCloneTypeIIVerifyConstraint {
274 public:
275   void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
276 };
277 
278 /// Ensures that every clone has at least the given complexity.
279 ///
280 /// Complexity is here defined as the total amount of children of a statement.
281 /// This constraint assumes the first statement in the group is representative
282 /// for all other statements in the group in terms of complexity.
283 class MinComplexityConstraint {
284   unsigned MinComplexity;
285 
286 public:
MinComplexityConstraint(unsigned MinComplexity)287   MinComplexityConstraint(unsigned MinComplexity)
288       : MinComplexity(MinComplexity) {}
289 
290   /// Calculates the complexity of the given StmtSequence.
291   /// \param Limit The limit of complexity we probe for. After reaching
292   ///              this limit during calculation, this method is exiting
293   ///              early to improve performance and returns this limit.
294   size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
295                                  const std::string &ParentMacroStack = "");
296 
constrain(std::vector<CloneDetector::CloneGroup> & CloneGroups)297   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
298     CloneConstraint::filterGroups(
299         CloneGroups, [this](const CloneDetector::CloneGroup &A) {
300           if (!A.empty())
301             return calculateStmtComplexity(A.front(), MinComplexity) <
302                    MinComplexity;
303           else
304             return false;
305         });
306   }
307 };
308 
309 /// Ensures that all clone groups contain at least the given amount of clones.
310 class MinGroupSizeConstraint {
311   unsigned MinGroupSize;
312 
313 public:
314   MinGroupSizeConstraint(unsigned MinGroupSize = 2)
MinGroupSize(MinGroupSize)315       : MinGroupSize(MinGroupSize) {}
316 
constrain(std::vector<CloneDetector::CloneGroup> & CloneGroups)317   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
318     CloneConstraint::filterGroups(CloneGroups,
319                                   [this](const CloneDetector::CloneGroup &A) {
320                                     return A.size() < MinGroupSize;
321                                   });
322   }
323 };
324 
325 /// Ensures that no clone group fully contains another clone group.
326 struct OnlyLargestCloneConstraint {
327   void constrain(std::vector<CloneDetector::CloneGroup> &Result);
328 };
329 
330 struct FilenamePatternConstraint {
331   StringRef IgnoredFilesPattern;
332   std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
333 
FilenamePatternConstraintFilenamePatternConstraint334   FilenamePatternConstraint(StringRef IgnoredFilesPattern)
335       : IgnoredFilesPattern(IgnoredFilesPattern) {
336     IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
337         IgnoredFilesPattern.str() + "$)");
338   }
339 
340   bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
341 
constrainFilenamePatternConstraint342   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
343     CloneConstraint::filterGroups(
344         CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
345           return isAutoGenerated(Group);
346         });
347   }
348 };
349 
350 /// Analyzes the pattern of the referenced variables in a statement.
351 class VariablePattern {
352 
353   /// Describes an occurrence of a variable reference in a statement.
354   struct VariableOccurence {
355     /// The index of the associated VarDecl in the Variables vector.
356     size_t KindID;
357     /// The statement in the code where the variable was referenced.
358     const Stmt *Mention;
359 
VariableOccurenceVariableOccurence360     VariableOccurence(size_t KindID, const Stmt *Mention)
361         : KindID(KindID), Mention(Mention) {}
362   };
363 
364   /// All occurrences of referenced variables in the order of appearance.
365   std::vector<VariableOccurence> Occurences;
366   /// List of referenced variables in the order of appearance.
367   /// Every item in this list is unique.
368   std::vector<const VarDecl *> Variables;
369 
370   /// Adds a new variable referenced to this pattern.
371   /// \param VarDecl The declaration of the variable that is referenced.
372   /// \param Mention The SourceRange where this variable is referenced.
373   void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
374 
375   /// Adds each referenced variable from the given statement.
376   void addVariables(const Stmt *S);
377 
378 public:
379   /// Creates an VariablePattern object with information about the given
380   /// StmtSequence.
VariablePattern(const StmtSequence & Sequence)381   VariablePattern(const StmtSequence &Sequence) {
382     for (const Stmt *S : Sequence)
383       addVariables(S);
384   }
385 
386   /// Describes two clones that reference their variables in a different pattern
387   /// which could indicate a programming error.
388   struct SuspiciousClonePair {
389     /// Utility class holding the relevant information about a single
390     /// clone in this pair.
391     struct SuspiciousCloneInfo {
392       /// The variable which referencing in this clone was against the pattern.
393       const VarDecl *Variable;
394       /// Where the variable was referenced.
395       const Stmt *Mention;
396       /// The variable that should have been referenced to follow the pattern.
397       /// If Suggestion is a nullptr then it's not possible to fix the pattern
398       /// by referencing a different variable in this clone.
399       const VarDecl *Suggestion;
SuspiciousCloneInfoSuspiciousClonePair::SuspiciousCloneInfo400       SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
401                           const VarDecl *Suggestion)
402           : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
SuspiciousCloneInfoSuspiciousClonePair::SuspiciousCloneInfo403       SuspiciousCloneInfo() {}
404     };
405     /// The first clone in the pair which always has a suggested variable.
406     SuspiciousCloneInfo FirstCloneInfo;
407     /// This other clone in the pair which can have a suggested variable.
408     SuspiciousCloneInfo SecondCloneInfo;
409   };
410 
411   /// Counts the differences between this pattern and the given one.
412   /// \param Other The given VariablePattern to compare with.
413   /// \param FirstMismatch Output parameter that will be filled with information
414   ///        about the first difference between the two patterns. This parameter
415   ///        can be a nullptr, in which case it will be ignored.
416   /// \return Returns the number of differences between the pattern this object
417   ///         is following and the given VariablePattern.
418   ///
419   /// For example, the following statements all have the same pattern and this
420   /// function would return zero:
421   ///
422   ///   if (a < b) return a; return b;
423   ///   if (x < y) return x; return y;
424   ///   if (u2 < u1) return u2; return u1;
425   ///
426   /// But the following statement has a different pattern (note the changed
427   /// variables in the return statements) and would have two differences when
428   /// compared with one of the statements above.
429   ///
430   ///   if (a < b) return b; return a;
431   ///
432   /// This function should only be called if the related statements of the given
433   /// pattern and the statements of this objects are clones of each other.
434   unsigned countPatternDifferences(
435       const VariablePattern &Other,
436       VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
437 };
438 
439 /// Ensures that all clones reference variables in the same pattern.
440 struct MatchingVariablePatternConstraint {
441   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
442 };
443 
444 } // end namespace clang
445 
446 #endif // LLVM_CLANG_AST_CLONEDETECTION_H
447