1 //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Interface file for the IRSimilarityIdentifier for identifying similarities in
11 // IR including the IRInstructionMapper, which maps an Instruction to unsigned
12 // integers.
13 //
14 // Two sequences of instructions are called "similar" if they perform the same
15 // series of operations for all inputs.
16 //
17 // \code
18 // %1 = add i32 %a, 10
19 // %2 = add i32 %a, %1
20 // %3 = icmp slt icmp %1, %2
21 // \endcode
22 //
23 // and
24 //
25 // \code
26 // %1 = add i32 11, %a
27 // %2 = sub i32 %a, %1
28 // %3 = icmp sgt icmp %2, %1
29 // \endcode
30 //
31 // ultimately have the same result, even if the inputs, and structure are
32 // slightly different.
33 //
34 // For instructions, we do not worry about operands that do not have fixed
35 // semantic meaning to the program.  We consider the opcode that the instruction
36 // has, the types, parameters, and extra information such as the function name,
37 // or comparison predicate.  These are used to create a hash to map instructions
38 // to integers to be used in similarity matching in sequences of instructions
39 //
40 // Terminology:
41 // An IRSimilarityCandidate is a region of IRInstructionData (wrapped
42 // Instructions), usually used to denote a region of similarity has been found.
43 //
44 // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
45 // similar to one another.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
50 #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
51 
52 #include "llvm/IR/InstVisitor.h"
53 #include "llvm/IR/Instructions.h"
54 #include "llvm/IR/Module.h"
55 #include "llvm/IR/PassManager.h"
56 #include "llvm/Pass.h"
57 #include "llvm/Support/Allocator.h"
58 
59 namespace llvm {
60 namespace IRSimilarity {
61 
62 struct IRInstructionDataList;
63 
64 /// This represents what is and is not supported when finding similarity in
65 /// Instructions.
66 ///
67 /// Legal Instructions are considered when looking at similarity between
68 /// Instructions.
69 ///
70 /// Illegal Instructions cannot be considered when looking for similarity
71 /// between Instructions. They act as boundaries between similarity regions.
72 ///
73 /// Invisible Instructions are skipped over during analysis.
74 // TODO: Shared with MachineOutliner
75 enum InstrType { Legal, Illegal, Invisible };
76 
77 /// This provides the utilities for hashing an Instruction to an unsigned
78 /// integer. Two IRInstructionDatas produce the same hash value when their
79 /// underlying Instructions perform the same operation (even if they don't have
80 /// the same input operands.)
81 /// As a more concrete example, consider the following:
82 ///
83 /// \code
84 /// %add1 = add i32 %a, %b
85 /// %add2 = add i32 %c, %d
86 /// %add3 = add i64 %e, %f
87 /// \endcode
88 ///
89 // Then the IRInstructionData wrappers for these Instructions may be hashed like
90 /// so:
91 ///
92 /// \code
93 /// ; These two adds have the same types and operand types, so they hash to the
94 /// ; same number.
95 /// %add1 = add i32 %a, %b ; Hash: 1
96 /// %add2 = add i32 %c, %d ; Hash: 1
97 /// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
98 /// ; it hashes to a different number.
99 /// %add3 = add i64 %e, %f; Hash: 2
100 /// \endcode
101 ///
102 ///
103 /// This hashing scheme will be used to represent the program as a very long
104 /// string. This string can then be placed in a data structure which can be used
105 /// for similarity queries.
106 ///
107 /// TODO: Handle types of Instructions which can be equal even with different
108 /// operands. (E.g. comparisons with swapped predicates.)
109 /// TODO: Handle CallInsts, which are only checked for function type
110 /// by \ref isSameOperationAs.
111 /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
112 /// exact same, and some do not.
113 struct IRInstructionData : ilist_node<IRInstructionData> {
114 
115   /// The source Instruction that is being wrapped.
116   Instruction *Inst = nullptr;
117   /// The values of the operands in the Instruction.
118   SmallVector<Value *, 4> OperVals;
119   /// The legality of the wrapped instruction. This is informed by InstrType,
120   /// and is used when checking when two instructions are considered similar.
121   /// If either instruction is not legal, the instructions are automatically not
122   /// considered similar.
123   bool Legal;
124 
125   /// Gather the information that is difficult to gather for an Instruction, or
126   /// is changed. i.e. the operands of an Instruction and the Types of those
127   /// operands. This extra information allows for similarity matching to make
128   /// assertions that allow for more flexibility when checking for whether an
129   /// Instruction performs the same operation.
130   IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
131 
132   /// Hashes \p Value based on its opcode, types, and operand types.
133   /// Two IRInstructionData instances produce the same hash when they perform
134   /// the same operation.
135   ///
136   /// As a simple example, consider the following instructions.
137   ///
138   /// \code
139   /// %add1 = add i32 %x1, %y1
140   /// %add2 = add i32 %x2, %y2
141   ///
142   /// %sub = sub i32 %x1, %y1
143   ///
144   /// %add_i64 = add i64 %x2, %y2
145   /// \endcode
146   ///
147   /// Because the first two adds operate the same types, and are performing the
148   /// same action, they will be hashed to the same value.
149   ///
150   /// However, the subtraction instruction is not the same as an addition, and
151   /// will be hashed to a different value.
152   ///
153   /// Finally, the last add has a different type compared to the first two add
154   /// instructions, so it will also be hashed to a different value that any of
155   /// the previous instructions.
156   ///
157   /// \param [in] ID - The IRInstructionData instance to be hashed.
158   /// \returns A hash_value of the IRInstructionData.
hash_valueIRInstructionData159   friend hash_code hash_value(const IRInstructionData &ID) {
160     SmallVector<Type *, 4> OperTypes;
161     for (Value *V : ID.OperVals)
162       OperTypes.push_back(V->getType());
163 
164     return llvm::hash_combine(
165         llvm::hash_value(ID.Inst->getOpcode()),
166         llvm::hash_value(ID.Inst->getType()),
167         llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
168   }
169 
170   IRInstructionDataList *IDL = nullptr;
171 };
172 
173 struct IRInstructionDataList : simple_ilist<IRInstructionData> {};
174 
175 /// Compare one IRInstructionData class to another IRInstructionData class for
176 /// whether they are performing a the same operation, and can mapped to the
177 /// same value. For regular instructions if the hash value is the same, then
178 /// they will also be close.
179 ///
180 /// \param A - The first IRInstructionData class to compare
181 /// \param B - The second IRInstructionData class to compare
182 /// \returns true if \p A and \p B are similar enough to be mapped to the same
183 /// value.
184 bool isClose(const IRInstructionData &A, const IRInstructionData &B);
185 
186 struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
getEmptyKeyIRInstructionDataTraits187   static inline IRInstructionData *getEmptyKey() { return nullptr; }
getTombstoneKeyIRInstructionDataTraits188   static inline IRInstructionData *getTombstoneKey() {
189     return reinterpret_cast<IRInstructionData *>(-1);
190   }
191 
getHashValueIRInstructionDataTraits192   static unsigned getHashValue(const IRInstructionData *E) {
193     using llvm::hash_value;
194     assert(E && "IRInstructionData is a nullptr?");
195     return hash_value(*E);
196   }
197 
isEqualIRInstructionDataTraits198   static bool isEqual(const IRInstructionData *LHS,
199                       const IRInstructionData *RHS) {
200     if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
201         LHS == getEmptyKey() || LHS == getTombstoneKey())
202       return LHS == RHS;
203 
204     assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
205     return isClose(*LHS, *RHS);
206   }
207 };
208 
209 /// Helper struct for converting the Instructions in a Module into a vector of
210 /// unsigned integers. This vector of unsigned integers can be thought of as a
211 /// "numeric string". This numeric string can then be queried by, for example,
212 /// data structures that find repeated substrings.
213 ///
214 /// This hashing is done per BasicBlock in the module. To hash Instructions
215 /// based off of their operations, each Instruction is wrapped in an
216 /// IRInstructionData struct. The unsigned integer for an IRInstructionData
217 /// depends on:
218 /// - The hash provided by the IRInstructionData.
219 /// - Which member of InstrType the IRInstructionData is classified as.
220 // See InstrType for more details on the possible classifications, and how they
221 // manifest in the numeric string.
222 ///
223 /// The numeric string for an individual BasicBlock is terminated by an unique
224 /// unsigned integer. This prevents data structures which rely on repetition
225 /// from matching across BasicBlocks. (For example, the SuffixTree.)
226 /// As a concrete example, if we have the following two BasicBlocks:
227 /// \code
228 /// bb0:
229 /// %add1 = add i32 %a, %b
230 /// %add2 = add i32 %c, %d
231 /// %add3 = add i64 %e, %f
232 /// bb1:
233 /// %sub = sub i32 %c, %d
234 /// \endcode
235 /// We may hash the Instructions like this (via IRInstructionData):
236 /// \code
237 /// bb0:
238 /// %add1 = add i32 %a, %b ; Hash: 1
239 /// %add2 = add i32 %c, %d; Hash: 1
240 /// %add3 = add i64 %e, %f; Hash: 2
241 /// bb1:
242 /// %sub = sub i32 %c, %d; Hash: 3
243 /// %add4 = add i32 %c, %d ; Hash: 1
244 /// \endcode
245 /// And produce a "numeric string representation" like so:
246 /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
247 ///
248 /// TODO: This is very similar to the MachineOutliner, and should be
249 /// consolidated into the same interface.
250 struct IRInstructionMapper {
251   /// The starting illegal instruction number to map to.
252   ///
253   /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
254   unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
255 
256   /// The next available integer to assign to a legal Instruction to.
257   unsigned LegalInstrNumber = 0;
258 
259   /// Correspondence from IRInstructionData to unsigned integers.
260   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
261       InstructionIntegerMap;
262 
263   /// Set if we added an illegal number in the previous step.
264   /// Since each illegal number is unique, we only need one of them between
265   /// each range of legal numbers. This lets us make sure we don't add more
266   /// than one illegal number per range.
267   bool AddedIllegalLastTime = false;
268 
269   /// Marks whether we found a illegal instruction in the previous step.
270   bool CanCombineWithPrevInstr = false;
271 
272   /// Marks whether we have found a set of instructions that is long enough
273   /// to be considered for similarity.
274   bool HaveLegalRange = false;
275 
276   /// This allocator pointer is in charge of holding on to the IRInstructionData
277   /// so it is not deallocated until whatever external tool is using it is done
278   /// with the information.
279   SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
280 
281   /// This allocator pointer is in charge of creating the IRInstructionDataList
282   /// so it is not deallocated until whatever external tool is using it is done
283   /// with the information.
284   SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
285 
286   /// Get an allocated IRInstructionData struct using the InstDataAllocator.
287   ///
288   /// \param I - The Instruction to wrap with IRInstructionData.
289   /// \param Legality - A boolean value that is true if the instruction is to
290   /// be considered for similarity, and false if not.
291   /// \param IDL - The InstructionDataList that the IRInstructionData is
292   /// inserted into.
293   /// \returns An allocated IRInstructionData struct.
294   IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
295                                                IRInstructionDataList &IDL);
296 
297   /// Get an allocated IRInstructionDataList object using the IDLAllocator.
298   ///
299   /// \returns An allocated IRInstructionDataList object.
300   IRInstructionDataList *allocateIRInstructionDataList();
301 
302   IRInstructionDataList *IDL = nullptr;
303 
304   /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
305   /// determined by \p InstrType. Two Instructions are mapped to the same value
306   /// if they are close as defined by the InstructionData class above.
307   ///
308   /// \param [in] BB - The BasicBlock to be mapped to integers.
309   /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
310   /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
311   void convertToUnsignedVec(BasicBlock &BB,
312                             std::vector<IRInstructionData *> &InstrList,
313                             std::vector<unsigned> &IntegerMapping);
314 
315   /// Maps an Instruction to a legal integer.
316   ///
317   /// \param [in] It - The Instruction to be mapped to an integer.
318   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
319   /// append to.
320   /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
321   /// \returns The integer \p It was mapped to.
322   unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
323                               std::vector<unsigned> &IntegerMappingForBB,
324                               std::vector<IRInstructionData *> &InstrListForBB);
325 
326   /// Maps an Instruction to an illegal integer.
327   ///
328   /// \param [in] It - The \p Instruction to be mapped to an integer.
329   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
330   /// append to.
331   /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
332   /// \param End - true if creating a dummy IRInstructionData at the end of a
333   /// basic block.
334   /// \returns The integer \p It was mapped to.
335   unsigned mapToIllegalUnsigned(
336       BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
337       std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
338 
IRInstructionMapperIRInstructionMapper339   IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
340                       SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
341       : InstDataAllocator(IDA), IDLAllocator(IDLA) {
342     // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
343     // changed.
344     assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
345            "DenseMapInfo<unsigned>'s empty key isn't -1!");
346     assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
347                static_cast<unsigned>(-2) &&
348            "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
349 
350     IDL = new (IDLAllocator->Allocate())
351         IRInstructionDataList();
352   }
353 
354   /// Custom InstVisitor to classify different instructions for whether it can
355   /// be analyzed for similarity.
356   struct InstructionClassification
357       : public InstVisitor<InstructionClassification, InstrType> {
InstructionClassificationIRInstructionMapper::InstructionClassification358     InstructionClassification() {}
359 
360     // TODO: Determine a scheme to resolve when the label is similar enough.
visitBranchInstIRInstructionMapper::InstructionClassification361     InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
362     // TODO: Determine a scheme to resolve when the labels are similar enough.
visitPHINodeIRInstructionMapper::InstructionClassification363     InstrType visitPHINode(PHINode &PN) { return Illegal; }
364     // TODO: Handle allocas.
visitAllocaInstIRInstructionMapper::InstructionClassification365     InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
366     // We exclude variable argument instructions since variable arguments
367     // requires extra checking of the argument list.
visitVAArgInstIRInstructionMapper::InstructionClassification368     InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
369     // We exclude all exception handling cases since they are so context
370     // dependent.
visitLandingPadInstIRInstructionMapper::InstructionClassification371     InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
visitFuncletPadInstIRInstructionMapper::InstructionClassification372     InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
373     // DebugInfo should be included in the regions, but should not be
374     // analyzed for similarity as it has no bearing on the outcome of the
375     // program.
visitDbgInfoIntrinsicIRInstructionMapper::InstructionClassification376     InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
377     // TODO: Handle GetElementPtrInsts
visitGetElementPtrInstIRInstructionMapper::InstructionClassification378     InstrType visitGetElementPtrInst(GetElementPtrInst &GEPI) {
379       return Illegal;
380     }
381     // TODO: Handle specific intrinsics.
visitIntrinsicInstIRInstructionMapper::InstructionClassification382     InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
383     // TODO: Handle CallInsts.
visitCallInstIRInstructionMapper::InstructionClassification384     InstrType visitCallInst(CallInst &CI) { return Illegal; }
385     // TODO: We do not current handle similarity that changes the control flow.
visitInvokeInstIRInstructionMapper::InstructionClassification386     InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
387     // TODO: We do not current handle similarity that changes the control flow.
visitCallBrInstIRInstructionMapper::InstructionClassification388     InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
389     // TODO: Handle interblock similarity.
visitTerminatorIRInstructionMapper::InstructionClassification390     InstrType visitTerminator(Instruction &I) { return Illegal; }
visitInstructionIRInstructionMapper::InstructionClassification391     InstrType visitInstruction(Instruction &I) { return Legal; }
392   };
393 
394   /// Maps an Instruction to a member of InstrType.
395   InstructionClassification InstClassifier;
396 };
397 
398 /// This is a class that wraps a range of IRInstructionData from one point to
399 /// another in the vector of IRInstructionData, which is a region of the
400 /// program.  It is also responsible for defining the structure within this
401 /// region of instructions.
402 ///
403 /// The structure of a region is defined through a value numbering system
404 /// assigned to each unique value in a region at the creation of the
405 /// IRSimilarityCandidate.
406 ///
407 /// For example, for each Instruction we add a mapping for each new
408 /// value seen in that Instruction.
409 /// IR:                    Mapping Added:
410 /// %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
411 /// %add2 = add i32 %a, %1    %add2 -> 4
412 /// %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
413 ///
414 /// We can compare IRSimilarityCandidates against one another.
415 /// The \ref isSimilar function compares each IRInstructionData against one
416 /// another and if we have the same sequences of IRInstructionData that would
417 /// create the same hash, we have similar IRSimilarityCandidates.
418 ///
419 /// We can also compare the structure of IRSimilarityCandidates. If we can
420 /// create a mapping of registers in the region contained by one
421 /// IRSimilarityCandidate to the region contained by different
422 /// IRSimilarityCandidate, they can be considered structurally similar.
423 ///
424 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
425 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, %e
426 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
427 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
428 ///
429 /// Can have the following mapping from candidate to candidate of:
430 /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
431 /// and can be considered similar.
432 ///
433 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
434 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, c4
435 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
436 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
437 ///
438 /// We cannot create the same mapping since the use of c4 is not used in the
439 /// same way as %b or c2.
440 class IRSimilarityCandidate {
441 private:
442   /// The start index of this IRSimilarityCandidate in the instruction list.
443   unsigned StartIdx = 0;
444 
445   /// The number of instructions in this IRSimilarityCandidate.
446   unsigned Len = 0;
447 
448   /// The first instruction in this IRSimilarityCandidate.
449   IRInstructionData *FirstInst = nullptr;
450 
451   /// The last instruction in this IRSimilarityCandidate.
452   IRInstructionData *LastInst = nullptr;
453 
454   /// Global Value Numbering structures
455   /// @{
456   /// Stores the mapping of the value to the number assigned to it in the
457   /// IRSimilarityCandidate.
458   DenseMap<Value *, unsigned> ValueToNumber;
459   /// Stores the mapping of the number to the value assigned this number.
460   DenseMap<unsigned, Value *> NumberToValue;
461   /// @}
462 
463 public:
464   /// \param StartIdx - The starting location of the region.
465   /// \param Len - The length of the region.
466   /// \param FirstInstIt - The starting IRInstructionData of the region.
467   /// \param LastInstIt - The ending IRInstructionData of the region.
468   IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
469                         IRInstructionData *FirstInstIt,
470                         IRInstructionData *LastInstIt);
471 
472   /// \param A - The first IRInstructionCandidate to compare.
473   /// \param B - The second IRInstructionCandidate to compare.
474   /// \returns True when every IRInstructionData in \p A is similar to every
475   /// IRInstructionData in \p B.
476   static bool isSimilar(const IRSimilarityCandidate &A,
477                         const IRSimilarityCandidate &B);
478 
479   /// \param A - The first IRInstructionCandidate to compare.
480   /// \param B - The second IRInstructionCandidate to compare.
481   /// \returns True when every IRInstructionData in \p A is structurally similar
482   /// to \p B.
483   static bool compareStructure(const IRSimilarityCandidate &A,
484                                const IRSimilarityCandidate &B);
485 
486   struct OperandMapping {
487     /// The IRSimilarityCandidate that holds the instruction the OperVals were
488     /// pulled from.
489     const IRSimilarityCandidate &IRSC;
490 
491     /// The operand values to be analyzed.
492     ArrayRef<Value *> &OperVals;
493 
494     /// The current mapping of global value numbers from one IRSimilarityCandidate
495     /// to another IRSimilarityCandidate.
496     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
497   };
498 
499   /// Compare the operands in \p A and \p B and check that the current mapping
500   /// of global value numbers from \p A to \p B and \p B to \A is consistent.
501   ///
502   /// \param A - The first IRInstructionCandidate, operand values, and current
503   /// operand mappings to compare.
504   /// \param B - The second IRInstructionCandidate, operand values, and current
505   /// operand mappings to compare.
506   /// \returns true if the IRSimilarityCandidates operands are compatible.
507   static bool compareOperandMapping(OperandMapping A, OperandMapping B);
508 
509   /// Compare the start and end indices of the two IRSimilarityCandidates for
510   /// whether they overlap. If the start instruction of one
511   /// IRSimilarityCandidate is less than the end instruction of the other, and
512   /// the start instruction of one is greater than the start instruction of the
513   /// other, they overlap.
514   ///
515   /// \returns true if the IRSimilarityCandidates do not have overlapping
516   /// instructions.
517   static bool overlap(const IRSimilarityCandidate &A,
518                       const IRSimilarityCandidate &B);
519 
520   /// \returns the number of instructions in this Candidate.
getLength()521   unsigned getLength() const { return Len; }
522 
523   /// \returns the start index of this IRSimilarityCandidate.
getStartIdx()524   unsigned getStartIdx() const { return StartIdx; }
525 
526   /// \returns the end index of this IRSimilarityCandidate.
getEndIdx()527   unsigned getEndIdx() const { return StartIdx + Len - 1; }
528 
529   /// \returns The first IRInstructionData.
front()530   IRInstructionData *front() const { return FirstInst; }
531   /// \returns The last IRInstructionData.
back()532   IRInstructionData *back() const { return LastInst; }
533 
534   /// \returns The first Instruction.
frontInstruction()535   Instruction *frontInstruction() { return FirstInst->Inst; }
536   /// \returns The last Instruction
backInstruction()537   Instruction *backInstruction() { return LastInst->Inst; }
538 
539   /// \returns The BasicBlock the IRSimilarityCandidate starts in.
getStartBB()540   BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
541   /// \returns The BasicBlock the IRSimilarityCandidate ends in.
getEndBB()542   BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
543 
544   /// \returns The Function that the IRSimilarityCandidate is located in.
getFunction()545   Function *getFunction() { return getStartBB()->getParent(); }
546 
547   /// Finds the positive number associated with \p V if it has been mapped.
548   /// \param [in] V - the Value to find.
549   /// \returns The positive number corresponding to the value.
550   /// \returns None if not present.
getGVN(Value * V)551   Optional<unsigned> getGVN(Value *V) {
552     assert(V != nullptr && "Value is a nullptr?");
553     DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
554     if (VNIt == ValueToNumber.end())
555       return None;
556     return VNIt->second;
557   }
558 
559   /// Finds the Value associate with \p Num if it exists.
560   /// \param [in] Num - the number to find.
561   /// \returns The Value associated with the number.
562   /// \returns None if not present.
fromGVN(unsigned Num)563   Optional<Value *> fromGVN(unsigned Num) {
564     DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
565     if (VNIt == NumberToValue.end())
566       return None;
567     assert(VNIt->second != nullptr && "Found value is a nullptr!");
568     return VNIt->second;
569   }
570 
571   /// \param RHS -The IRSimilarityCandidate to compare against
572   /// \returns true if the IRSimilarityCandidate is occurs after the
573   /// IRSimilarityCandidate in the program.
574   bool operator<(const IRSimilarityCandidate &RHS) const {
575     return getStartIdx() > RHS.getStartIdx();
576   }
577 
578   using iterator = IRInstructionDataList::iterator;
begin()579   iterator begin() const { return iterator(front()); }
end()580   iterator end() const { return std::next(iterator(back())); }
581 };
582 
583 typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
584 typedef std::vector<SimilarityGroup> SimilarityGroupList;
585 
586 /// This class puts all the pieces of the IRInstructionData,
587 /// IRInstructionMapper, IRSimilarityCandidate together.
588 ///
589 /// It first feeds the Module or vector of Modules into the IRInstructionMapper,
590 /// and puts all the mapped instructions into a single long list of
591 /// IRInstructionData.
592 ///
593 /// The list of unsigned integers is given to the Suffix Tree or similar data
594 /// structure to find repeated subsequences.  We construct an
595 /// IRSimilarityCandidate for each instance of the subsequence.  We compare them
596 /// against one another since  These repeated subsequences can have different
597 /// structure.  For each different kind of structure found, we create a
598 /// similarity group.
599 ///
600 /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
601 /// structurally similar to one another, while C is different we would have two
602 /// SimilarityGroups:
603 ///
604 /// SimilarityGroup 1:  SimilarityGroup 2
605 /// A, B, D             C
606 ///
607 /// A list of the different similarity groups is then returned after
608 /// analyzing the module.
609 class IRSimilarityIdentifier {
610 public:
IRSimilarityIdentifier()611   IRSimilarityIdentifier()
612       : Mapper(&InstDataAllocator, &InstDataListAllocator) {}
613 
614   /// \param M the module to find similarity in.
IRSimilarityIdentifier(Module & M)615   explicit IRSimilarityIdentifier(Module &M)
616       : Mapper(&InstDataAllocator, &InstDataListAllocator) {
617     findSimilarity(M);
618   }
619 
620 private:
621   /// Map the instructions in the module to unsigned integers, using mapping
622   /// already present in the Mapper if possible.
623   ///
624   /// \param [in] M Module - To map to integers.
625   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
626   /// \param [in,out] IntegerMapping - The vector to append integers to.
627   void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
628                       std::vector<unsigned> &IntegerMapping);
629 
630   /// Map the instructions in the modules vector to unsigned integers, using
631   /// mapping already present in the mapper if possible.
632   ///
633   /// \param [in] Modules - The list of modules to use to populate the mapper
634   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
635   /// \param [in,out] IntegerMapping - The vector to append integers to.
636   void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
637                       std::vector<IRInstructionData *> &InstrList,
638                       std::vector<unsigned> &IntegerMapping);
639 
640   /// Find the similarity candidates in \p InstrList and corresponding
641   /// \p UnsignedVec
642   ///
643   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
644   /// \param [in,out] IntegerMapping - The vector to append integers to.
645   /// candidates found in the program.
646   void findCandidates(std::vector<IRInstructionData *> &InstrList,
647                       std::vector<unsigned> &IntegerMapping);
648 
649 public:
650   // Find the IRSimilarityCandidates in the \p Modules and group by structural
651   // similarity in a SimilarityGroup, each group is returned in a
652   // SimilarityGroupList.
653   //
654   // \param [in] Modules - the modules to analyze.
655   // \returns The groups of similarity ranges found in the modules.
656   SimilarityGroupList &
657   findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
658 
659   // Find the IRSimilarityCandidates in the given Module grouped by structural
660   // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
661   //
662   // \param [in] M - the module to analyze.
663   // \returns The groups of similarity ranges found in the module.
664   SimilarityGroupList &findSimilarity(Module &M);
665 
666   // Clears \ref SimilarityCandidates if it is already filled by a previous run.
resetSimilarityCandidates()667   void resetSimilarityCandidates() {
668     // If we've already analyzed a Module or set of Modules, so we must clear
669     // the SimilarityCandidates to make sure we do not have only old values
670     // hanging around.
671     if (SimilarityCandidates.hasValue())
672       SimilarityCandidates->clear();
673     else
674       SimilarityCandidates = SimilarityGroupList();
675   }
676 
677   // \returns The groups of similarity ranges found in the most recently passed
678   // set of modules.
getSimilarity()679   Optional<SimilarityGroupList> &getSimilarity() {
680     return SimilarityCandidates;
681   }
682 
683 private:
684   /// The allocator for IRInstructionData.
685   SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
686 
687   /// The allocator for IRInstructionDataLists.
688   SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
689 
690   /// Map Instructions to unsigned integers and wraps the Instruction in an
691   /// instance of IRInstructionData.
692   IRInstructionMapper Mapper;
693 
694   /// The SimilarityGroups found with the most recent run of \ref
695   /// findSimilarity. None if there is no recent run.
696   Optional<SimilarityGroupList> SimilarityCandidates;
697 };
698 
699 } // end namespace IRSimilarity
700 
701 /// An analysis pass based on legacy pass manager that runs and returns
702 /// IRSimilarityIdentifier run on the Module.
703 class IRSimilarityIdentifierWrapperPass : public ModulePass {
704   std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
705 
706 public:
707   static char ID;
708   IRSimilarityIdentifierWrapperPass();
709 
getIRSI()710   IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
getIRSI()711   const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
712 
713   bool doInitialization(Module &M) override;
714   bool doFinalization(Module &M) override;
715   bool runOnModule(Module &M) override;
getAnalysisUsage(AnalysisUsage & AU)716   void getAnalysisUsage(AnalysisUsage &AU) const override {
717     AU.setPreservesAll();
718   }
719 };
720 
721 /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
722 /// Module.
723 class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
724 public:
725   typedef IRSimilarity::IRSimilarityIdentifier Result;
726 
727   Result run(Module &M, ModuleAnalysisManager &);
728 
729 private:
730   friend AnalysisInfoMixin<IRSimilarityAnalysis>;
731   static AnalysisKey Key;
732 };
733 
734 /// Printer pass that uses \c IRSimilarityAnalysis.
735 class IRSimilarityAnalysisPrinterPass
736     : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
737   raw_ostream &OS;
738 
739 public:
IRSimilarityAnalysisPrinterPass(raw_ostream & OS)740   explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
741   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
742 };
743 
744 } // end namespace llvm
745 
746 #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
747