1 //===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Implementation file for the IRSimilarityIdentifier for identifying
11 // similarities in IR including the IRInstructionMapper.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Analysis/IRSimilarityIdentifier.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/IR/Intrinsics.h"
18 #include "llvm/IR/User.h"
19 #include "llvm/InitializePasses.h"
20 #include "llvm/Support/SuffixTree.h"
21 
22 using namespace llvm;
23 using namespace IRSimilarity;
24 
IRInstructionData(Instruction & I,bool Legality,IRInstructionDataList & IDList)25 IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
26                                      IRInstructionDataList &IDList)
27     : Inst(&I), Legal(Legality), IDL(&IDList) {
28   // Here we collect the operands to be used to determine whether two
29   // instructions are similar to one another.
30   for (Use &OI : I.operands())
31     OperVals.push_back(OI.get());
32 }
33 
isClose(const IRInstructionData & A,const IRInstructionData & B)34 bool IRSimilarity::isClose(const IRInstructionData &A,
35                            const IRInstructionData &B) {
36   return A.Legal && A.Inst->isSameOperationAs(B.Inst);
37 }
38 
39 // TODO: This is the same as the MachineOutliner, and should be consolidated
40 // into the same interface.
convertToUnsignedVec(BasicBlock & BB,std::vector<IRInstructionData * > & InstrList,std::vector<unsigned> & IntegerMapping)41 void IRInstructionMapper::convertToUnsignedVec(
42     BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
43     std::vector<unsigned> &IntegerMapping) {
44   BasicBlock::iterator It = BB.begin();
45 
46   std::vector<unsigned> IntegerMappingForBB;
47   std::vector<IRInstructionData *> InstrListForBB;
48 
49   HaveLegalRange = false;
50   CanCombineWithPrevInstr = false;
51   AddedIllegalLastTime = true;
52 
53   for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
54     switch (InstClassifier.visit(*It)) {
55     case InstrType::Legal:
56       mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
57       break;
58     case InstrType::Illegal:
59       mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
60       break;
61     case InstrType::Invisible:
62       AddedIllegalLastTime = false;
63       break;
64     }
65   }
66 
67   if (HaveLegalRange) {
68     mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true);
69     for_each(InstrListForBB,
70              [this](IRInstructionData *ID) { this->IDL->push_back(*ID); });
71     InstrList.insert(InstrList.end(), InstrListForBB.begin(),
72                      InstrListForBB.end());
73     IntegerMapping.insert(IntegerMapping.end(), IntegerMappingForBB.begin(),
74                           IntegerMappingForBB.end());
75   }
76 }
77 
78 // TODO: This is the same as the MachineOutliner, and should be consolidated
79 // into the same interface.
mapToLegalUnsigned(BasicBlock::iterator & It,std::vector<unsigned> & IntegerMappingForBB,std::vector<IRInstructionData * > & InstrListForBB)80 unsigned IRInstructionMapper::mapToLegalUnsigned(
81     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
82     std::vector<IRInstructionData *> &InstrListForBB) {
83   // We added something legal, so we should unset the AddedLegalLastTime
84   // flag.
85   AddedIllegalLastTime = false;
86 
87   // If we have at least two adjacent legal instructions (which may have
88   // invisible instructions in between), remember that.
89   if (CanCombineWithPrevInstr)
90     HaveLegalRange = true;
91   CanCombineWithPrevInstr = true;
92 
93   // Get the integer for this instruction or give it the current
94   // LegalInstrNumber.
95   IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL);
96   InstrListForBB.push_back(ID);
97 
98   // Add to the instruction list
99   bool WasInserted;
100   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
101       ResultIt;
102   std::tie(ResultIt, WasInserted) =
103       InstructionIntegerMap.insert(std::make_pair(ID, LegalInstrNumber));
104   unsigned INumber = ResultIt->second;
105 
106   // There was an insertion.
107   if (WasInserted)
108     LegalInstrNumber++;
109 
110   IntegerMappingForBB.push_back(INumber);
111 
112   // Make sure we don't overflow or use any integers reserved by the DenseMap.
113   assert(LegalInstrNumber < IllegalInstrNumber &&
114          "Instruction mapping overflow!");
115 
116   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
117          "Tried to assign DenseMap tombstone or empty key to instruction.");
118   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
119          "Tried to assign DenseMap tombstone or empty key to instruction.");
120 
121   return INumber;
122 }
123 
124 IRInstructionData *
allocateIRInstructionData(Instruction & I,bool Legality,IRInstructionDataList & IDL)125 IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
126                                                IRInstructionDataList &IDL) {
127   return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
128 }
129 
130 IRInstructionDataList *
allocateIRInstructionDataList()131 IRInstructionMapper::allocateIRInstructionDataList() {
132   return new (IDLAllocator->Allocate()) IRInstructionDataList();
133 }
134 
135 // TODO: This is the same as the MachineOutliner, and should be consolidated
136 // into the same interface.
mapToIllegalUnsigned(BasicBlock::iterator & It,std::vector<unsigned> & IntegerMappingForBB,std::vector<IRInstructionData * > & InstrListForBB,bool End)137 unsigned IRInstructionMapper::mapToIllegalUnsigned(
138     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
139     std::vector<IRInstructionData *> &InstrListForBB, bool End) {
140   // Can't combine an illegal instruction. Set the flag.
141   CanCombineWithPrevInstr = false;
142 
143   // Only add one illegal number per range of legal numbers.
144   if (AddedIllegalLastTime)
145     return IllegalInstrNumber;
146 
147   IRInstructionData *ID = nullptr;
148   if (!End)
149     ID = allocateIRInstructionData(*It, false, *IDL);
150   InstrListForBB.push_back(ID);
151 
152   // Remember that we added an illegal number last time.
153   AddedIllegalLastTime = true;
154   unsigned INumber = IllegalInstrNumber;
155   IntegerMappingForBB.push_back(IllegalInstrNumber--);
156 
157   assert(LegalInstrNumber < IllegalInstrNumber &&
158          "Instruction mapping overflow!");
159 
160   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
161          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
162 
163   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
164          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
165 
166   return INumber;
167 }
168 
IRSimilarityCandidate(unsigned StartIdx,unsigned Len,IRInstructionData * FirstInstIt,IRInstructionData * LastInstIt)169 IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
170                                              IRInstructionData *FirstInstIt,
171                                              IRInstructionData *LastInstIt)
172     : StartIdx(StartIdx), Len(Len) {
173 
174   assert(FirstInstIt != nullptr && "Instruction is nullptr!");
175   assert(LastInstIt != nullptr && "Instruction is nullptr!");
176   assert(StartIdx + Len > StartIdx &&
177          "Overflow for IRSimilarityCandidate range?");
178   assert(Len - 1 == static_cast<unsigned>(std::distance(
179                         iterator(FirstInstIt), iterator(LastInstIt))) &&
180          "Length of the first and last IRInstructionData do not match the "
181          "given length");
182 
183   // We iterate over the given instructions, and map each unique value
184   // to a unique number in the IRSimilarityCandidate ValueToNumber and
185   // NumberToValue maps.  A constant get its own value globally, the individual
186   // uses of the constants are not considered to be unique.
187   //
188   // IR:                    Mapping Added:
189   // %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
190   // %add2 = add i32 %a, %1    %add2 -> 4
191   // %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
192   //
193   // when replace with global values, starting from 1, would be
194   //
195   // 3 = add i32 1, 2
196   // 4 = add i32 1, 3
197   // 6 = add i32 5, 2
198   unsigned LocalValNumber = 1;
199   IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
200   for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
201     // Map the operand values to an unsigned integer if it does not already
202     // have an unsigned integer assigned to it.
203     for (Value *Arg : ID->OperVals)
204       if (ValueToNumber.find(Arg) == ValueToNumber.end()) {
205         ValueToNumber.try_emplace(Arg, LocalValNumber);
206         NumberToValue.try_emplace(LocalValNumber, Arg);
207         LocalValNumber++;
208       }
209 
210     // Mapping the instructions to an unsigned integer if it is not already
211     // exist in the mapping.
212     if (ValueToNumber.find(ID->Inst) == ValueToNumber.end()) {
213       ValueToNumber.try_emplace(ID->Inst, LocalValNumber);
214       NumberToValue.try_emplace(LocalValNumber, ID->Inst);
215       LocalValNumber++;
216     }
217   }
218 
219   // Setting the first and last instruction data pointers for the candidate.  If
220   // we got through the entire for loop without hitting an assert, we know
221   // that both of these instructions are not nullptrs.
222   FirstInst = FirstInstIt;
223   LastInst = LastInstIt;
224 }
225 
isSimilar(const IRSimilarityCandidate & A,const IRSimilarityCandidate & B)226 bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
227                                       const IRSimilarityCandidate &B) {
228   if (A.getLength() != B.getLength())
229     return false;
230 
231   auto InstrDataForBoth =
232       zip(make_range(A.begin(), A.end()), make_range(B.begin(), B.end()));
233 
234   return all_of(InstrDataForBoth,
235                 [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
236                   IRInstructionData &A = std::get<0>(R);
237                   IRInstructionData &B = std::get<1>(R);
238                   if (!A.Legal || !B.Legal)
239                     return false;
240                   return isClose(A, B);
241                 });
242 }
243 
244 /// Determine if operand number \p TargetArgVal is in the current mapping set
245 /// for operand number \p SourceArgVal.
246 ///
247 /// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
248 /// value numbers from source IRSimilarityCandidate to target
249 /// IRSimilarityCandidate.
250 /// \param [in] SourceArgVal The global value number for an operand in the
251 /// in the original candidate.
252 /// \param [in] TargetArgVal The global value number for the corresponding
253 /// operand in the other candidate.
254 /// \returns True if there exists a mapping and false if not.
checkNumberingAndReplace(DenseMap<unsigned,DenseSet<unsigned>> & CurrentSrcTgtNumberMapping,unsigned SourceArgVal,unsigned TargetArgVal)255 bool checkNumberingAndReplace(
256     DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
257     unsigned SourceArgVal, unsigned TargetArgVal) {
258   // We are given two unsigned integers representing the global values of
259   // the operands in different IRSimilarityCandidates and a current mapping
260   // between the two.
261   //
262   // Source Operand GVN: 1
263   // Target Operand GVN: 2
264   // CurrentMapping: {1: {1, 2}}
265   //
266   // Since we have mapping, and the target operand is contained in the set, we
267   // update it to:
268   // CurrentMapping: {1: {2}}
269   // and can return true. But, if the mapping was
270   // CurrentMapping: {1: {3}}
271   // we would return false.
272 
273   bool WasInserted;
274   DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
275 
276   std::tie(Val, WasInserted) = CurrentSrcTgtNumberMapping.insert(
277       std::make_pair(SourceArgVal, DenseSet<unsigned>({TargetArgVal})));
278 
279   // If we created a new mapping, then we are done.
280   if (WasInserted)
281     return true;
282 
283   // If there is more than one option in the mapping set, and the target value
284   // is included in the mapping set replace that set with one that only includes
285   // the target value, as it is the only valid mapping via the non commutative
286   // instruction.
287 
288   DenseSet<unsigned> &TargetSet = Val->second;
289   if (TargetSet.size() > 1 && TargetSet.find(TargetArgVal) != TargetSet.end()) {
290     TargetSet.clear();
291     TargetSet.insert(TargetArgVal);
292     return true;
293   }
294 
295   // Return true if we can find the value in the set.
296   return TargetSet.find(TargetArgVal) != TargetSet.end();
297 }
298 
compareOperandMapping(OperandMapping A,OperandMapping B)299 bool IRSimilarityCandidate::compareOperandMapping(OperandMapping A,
300                                                   OperandMapping B) {
301   // Iterators to keep track of where we are in the operands for each
302   // Instruction.
303   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
304   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
305   unsigned OperandLength = A.OperVals.size();
306 
307   // For each operand, get the value numbering and ensure it is consistent.
308   for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
309     unsigned OperValA = A.IRSC.ValueToNumber.find(*VItA)->second;
310     unsigned OperValB = B.IRSC.ValueToNumber.find(*VItB)->second;
311 
312     // Attempt to add a set with only the target value.  If there is no mapping
313     // we can create it here.
314     //
315     // For an instruction like a subtraction:
316     // IRSimilarityCandidateA:  IRSimilarityCandidateB:
317     // %resultA = sub %a, %b    %resultB = sub %d, %e
318     //
319     // We map %a -> %d and %b -> %e.
320     //
321     // And check to see whether their mapping is consistent in
322     // checkNumberingAndReplace.
323 
324     if (!checkNumberingAndReplace(A.ValueNumberMapping, OperValA, OperValB))
325       return false;
326 
327     if (!checkNumberingAndReplace(B.ValueNumberMapping, OperValB, OperValA))
328       return false;
329   }
330   return true;
331 }
332 
compareStructure(const IRSimilarityCandidate & A,const IRSimilarityCandidate & B)333 bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
334                                              const IRSimilarityCandidate &B) {
335   if (A.getLength() != B.getLength())
336     return false;
337 
338   if (A.ValueToNumber.size() != B.ValueToNumber.size())
339     return false;
340 
341   iterator ItA = A.begin();
342   iterator ItB = B.begin();
343 
344   // These sets create a create a mapping between the values in one candidate
345   // to values in the other candidate.  If we create a set with one element,
346   // and that same element maps to the original element in the candidate
347   // we have a good mapping.
348   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
349   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
350   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
351 
352   bool WasInserted;
353 
354   // Iterate over the instructions contained in each candidate
355   unsigned SectionLength = A.getStartIdx() + A.getLength();
356   for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
357        ItA++, ItB++, Loc++) {
358     // Make sure the instructions are similar to one another.
359     if (!isClose(*ItA, *ItB))
360       return false;
361 
362     Instruction *IA = ItA->Inst;
363     Instruction *IB = ItB->Inst;
364 
365     if (!ItA->Legal || !ItB->Legal)
366       return false;
367 
368     // Get the operand sets for the instructions.
369     ArrayRef<Value *> OperValsA = ItA->OperVals;
370     ArrayRef<Value *> OperValsB = ItB->OperVals;
371 
372     unsigned InstValA = A.ValueToNumber.find(IA)->second;
373     unsigned InstValB = B.ValueToNumber.find(IB)->second;
374 
375     // Ensure that the mappings for the instructions exists.
376     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert(
377         std::make_pair(InstValA, DenseSet<unsigned>({InstValB})));
378     if (!WasInserted && ValueMappingIt->second.find(InstValB) ==
379                             ValueMappingIt->second.end())
380       return false;
381 
382     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingB.insert(
383         std::make_pair(InstValB, DenseSet<unsigned>({InstValA})));
384     if (!WasInserted && ValueMappingIt->second.find(InstValA) ==
385                             ValueMappingIt->second.end())
386       return false;
387 
388     // TODO: Handle commutative instructions by mapping one operand to many
389     // operands instead only mapping a single operand to a single operand.
390     if (!compareOperandMapping({A, OperValsA, ValueNumberMappingA},
391                                {B, OperValsB, ValueNumberMappingB}))
392       return false;
393   }
394   return true;
395 }
396 
overlap(const IRSimilarityCandidate & A,const IRSimilarityCandidate & B)397 bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
398                                     const IRSimilarityCandidate &B) {
399   auto DoesOverlap = [](const IRSimilarityCandidate &X,
400                         const IRSimilarityCandidate &Y) {
401     // Check:
402     // XXXXXX        X starts before Y ends
403     //      YYYYYYY  Y starts after X starts
404     return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
405   };
406 
407   return DoesOverlap(A, B) || DoesOverlap(B, A);
408 }
409 
populateMapper(Module & M,std::vector<IRInstructionData * > & InstrList,std::vector<unsigned> & IntegerMapping)410 void IRSimilarityIdentifier::populateMapper(
411     Module &M, std::vector<IRInstructionData *> &InstrList,
412     std::vector<unsigned> &IntegerMapping) {
413 
414   std::vector<IRInstructionData *> InstrListForModule;
415   std::vector<unsigned> IntegerMappingForModule;
416   // Iterate over the functions in the module to map each Instruction in each
417   // BasicBlock to an unsigned integer.
418   for (Function &F : M) {
419 
420     if (F.empty())
421       continue;
422 
423     for (BasicBlock &BB : F) {
424 
425       if (BB.sizeWithoutDebug() < 2)
426         continue;
427 
428       // BB has potential to have similarity since it has a size greater than 2
429       // and can therefore match other regions greater than 2. Map it to a list
430       // of unsigned integers.
431       Mapper.convertToUnsignedVec(BB, InstrListForModule,
432                                   IntegerMappingForModule);
433     }
434   }
435 
436   // Insert the InstrListForModule at the end of the overall InstrList so that
437   // we can have a long InstrList for the entire set of Modules being analyzed.
438   InstrList.insert(InstrList.end(), InstrListForModule.begin(),
439                    InstrListForModule.end());
440   // Do the same as above, but for IntegerMapping.
441   IntegerMapping.insert(IntegerMapping.end(), IntegerMappingForModule.begin(),
442                      IntegerMappingForModule.end());
443 }
444 
populateMapper(ArrayRef<std::unique_ptr<Module>> & Modules,std::vector<IRInstructionData * > & InstrList,std::vector<unsigned> & IntegerMapping)445 void IRSimilarityIdentifier::populateMapper(
446     ArrayRef<std::unique_ptr<Module>> &Modules,
447     std::vector<IRInstructionData *> &InstrList,
448     std::vector<unsigned> &IntegerMapping) {
449 
450   // Iterate over, and map the instructions in each module.
451   for (const std::unique_ptr<Module> &M : Modules)
452     populateMapper(*M, InstrList, IntegerMapping);
453 }
454 
455 /// From a repeated subsequence, find all the different instances of the
456 /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
457 /// the IRInstructionData in subsequence.
458 ///
459 /// \param [in] Mapper - The instruction mapper for sanity checks.
460 /// \param [in] InstrList - The vector that holds the instruction data.
461 /// \param [in] IntegerMapping - The vector that holds the mapped integers.
462 /// \param [out] CandsForRepSubstring - The vector to store the generated
463 /// IRSimilarityCandidates.
createCandidatesFromSuffixTree(IRInstructionMapper Mapper,std::vector<IRInstructionData * > & InstrList,std::vector<unsigned> & IntegerMapping,SuffixTree::RepeatedSubstring & RS,std::vector<IRSimilarityCandidate> & CandsForRepSubstring)464 static void createCandidatesFromSuffixTree(
465     IRInstructionMapper Mapper, std::vector<IRInstructionData *> &InstrList,
466     std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
467     std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
468 
469   unsigned StringLen = RS.Length;
470 
471   // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
472   for (const unsigned &StartIdx : RS.StartIndices) {
473     unsigned EndIdx = StartIdx + StringLen - 1;
474 
475     // Check that this subsequence does not contain an illegal instruction.
476     bool ContainsIllegal = false;
477     for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
478       unsigned Key = IntegerMapping[CurrIdx];
479       if (Key > Mapper.IllegalInstrNumber) {
480         ContainsIllegal = true;
481         break;
482       }
483     }
484 
485     // If we have an illegal instruction, we should not create an
486     // IRSimilarityCandidate for this region.
487     if (ContainsIllegal)
488       continue;
489 
490     // We are getting iterators to the instructions in this region of code
491     // by advancing the start and end indices from the start of the
492     // InstrList.
493     std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
494     std::advance(StartIt, StartIdx);
495     std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
496     std::advance(EndIt, EndIdx);
497 
498     CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt);
499   }
500 }
501 
502 /// From the list of IRSimilarityCandidates, perform a comparison between each
503 /// IRSimilarityCandidate to determine if there are overlapping
504 /// IRInstructionData, or if they do not have the same structure.
505 ///
506 /// \param [in] CandsForRepSubstring - The vector containing the
507 /// IRSimilarityCandidates.
508 /// \param [out] StructuralGroups - the mapping of unsigned integers to vector
509 /// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
510 /// vector are structurally similar to one another.
findCandidateStructures(std::vector<IRSimilarityCandidate> & CandsForRepSubstring,DenseMap<unsigned,SimilarityGroup> & StructuralGroups)511 static void findCandidateStructures(
512     std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
513     DenseMap<unsigned, SimilarityGroup> &StructuralGroups) {
514   std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
515       InnerCandEndIt;
516 
517   // IRSimilarityCandidates each have a structure for operand use.  It is
518   // possible that two instances of the same subsequences have different
519   // structure. Each type of structure found is assigned a number.  This
520   // DenseMap maps an IRSimilarityCandidate to which type of similarity
521   // discovered it fits within.
522   DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
523 
524   // Find the compatibility from each candidate to the others to determine
525   // which candidates overlap and which have the same structure by mapping
526   // each structure to a different group.
527   bool SameStructure;
528   bool Inserted;
529   unsigned CurrentGroupNum = 0;
530   unsigned OuterGroupNum;
531   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
532   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
533   DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
534 
535   // Iterate over the candidates to determine its structural and overlapping
536   // compatibility with other instructions
537   for (CandIt = CandsForRepSubstring.begin(),
538       CandEndIt = CandsForRepSubstring.end();
539        CandIt != CandEndIt; CandIt++) {
540 
541     // Determine if it has an assigned structural group already.
542     CandToGroupIt = CandToGroup.find(&*CandIt);
543     if (CandToGroupIt == CandToGroup.end()) {
544       // If not, we assign it one, and add it to our mapping.
545       std::tie(CandToGroupIt, Inserted) =
546           CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++));
547     }
548 
549     // Get the structural group number from the iterator.
550     OuterGroupNum = CandToGroupIt->second;
551 
552     // Check if we already have a list of IRSimilarityCandidates for the current
553     // structural group.  Create one if one does not exist.
554     CurrentGroupPair = StructuralGroups.find(OuterGroupNum);
555     if (CurrentGroupPair == StructuralGroups.end())
556       std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert(
557           std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt})));
558 
559     // Iterate over the IRSimilarityCandidates following the current
560     // IRSimilarityCandidate in the list to determine whether the two
561     // IRSimilarityCandidates are compatible.  This is so we do not repeat pairs
562     // of IRSimilarityCandidates.
563     for (InnerCandIt = std::next(CandIt),
564         InnerCandEndIt = CandsForRepSubstring.end();
565          InnerCandIt != InnerCandEndIt; InnerCandIt++) {
566 
567       // We check if the inner item has a group already, if it does, we skip it.
568       CandToGroupItInner = CandToGroup.find(&*InnerCandIt);
569       if (CandToGroupItInner != CandToGroup.end())
570         continue;
571 
572       // Otherwise we determine if they have the same structure and add it to
573       // vector if they match.
574       SameStructure =
575           IRSimilarityCandidate::compareStructure(*CandIt, *InnerCandIt);
576       if (!SameStructure)
577         continue;
578 
579       CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
580       CurrentGroupPair->second.push_back(*InnerCandIt);
581     }
582   }
583 }
584 
findCandidates(std::vector<IRInstructionData * > & InstrList,std::vector<unsigned> & IntegerMapping)585 void IRSimilarityIdentifier::findCandidates(
586     std::vector<IRInstructionData *> &InstrList,
587     std::vector<unsigned> &IntegerMapping) {
588   SuffixTree ST(IntegerMapping);
589 
590   std::vector<IRSimilarityCandidate> CandsForRepSubstring;
591   std::vector<SimilarityGroup> NewCandidateGroups;
592 
593   DenseMap<unsigned, SimilarityGroup> StructuralGroups;
594 
595   // Iterate over the subsequences found by the Suffix Tree to create
596   // IRSimilarityCandidates for each repeated subsequence and determine which
597   // instances are structurally similar to one another.
598   for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
599     createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, *It,
600                                    CandsForRepSubstring);
601 
602     if (CandsForRepSubstring.size() < 2)
603       continue;
604 
605     findCandidateStructures(CandsForRepSubstring, StructuralGroups);
606     for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups)
607       // We only add the group if it contains more than one
608       // IRSimilarityCandidate.  If there is only one, that means there is no
609       // other repeated subsequence with the same structure.
610       if (Group.second.size() > 1)
611         SimilarityCandidates->push_back(Group.second);
612 
613     CandsForRepSubstring.clear();
614     StructuralGroups.clear();
615     NewCandidateGroups.clear();
616   }
617 }
618 
findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules)619 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
620     ArrayRef<std::unique_ptr<Module>> Modules) {
621   resetSimilarityCandidates();
622 
623   std::vector<IRInstructionData *> InstrList;
624   std::vector<unsigned> IntegerMapping;
625 
626   populateMapper(Modules, InstrList, IntegerMapping);
627   findCandidates(InstrList, IntegerMapping);
628 
629   return SimilarityCandidates.getValue();
630 }
631 
findSimilarity(Module & M)632 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
633   resetSimilarityCandidates();
634 
635   std::vector<IRInstructionData *> InstrList;
636   std::vector<unsigned> IntegerMapping;
637 
638   populateMapper(M, InstrList, IntegerMapping);
639   findCandidates(InstrList, IntegerMapping);
640 
641   return SimilarityCandidates.getValue();
642 }
643 
644 INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
645                 "ir-similarity-identifier", false, true)
646 
IRSimilarityIdentifierWrapperPass()647 IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
648     : ModulePass(ID) {
649   initializeIRSimilarityIdentifierWrapperPassPass(
650       *PassRegistry::getPassRegistry());
651 }
652 
doInitialization(Module & M)653 bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
654   IRSI.reset(new IRSimilarityIdentifier(M));
655   return false;
656 }
657 
doFinalization(Module & M)658 bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
659   IRSI.reset();
660   return false;
661 }
662 
runOnModule(Module & M)663 bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
664   // All the real work is done in the constructor for the pass.
665   IRSI.reset(new IRSimilarityIdentifier(M));
666   return false;
667 }
668 
669 AnalysisKey IRSimilarityAnalysis::Key;
run(Module & M,ModuleAnalysisManager &)670 IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
671                                                ModuleAnalysisManager &) {
672 
673   return IRSimilarityIdentifier(M);
674 }
675 
676 PreservedAnalyses
run(Module & M,ModuleAnalysisManager & AM)677 IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
678   IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(M);
679   Optional<SimilarityGroupList> &SimilarityCandidatesOpt = IRSI.getSimilarity();
680 
681   for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
682     OS << CandVec.size() << " candidates of length "
683        << CandVec.begin()->getLength() << ".  Found in: \n";
684     for (IRSimilarityCandidate &Cand : CandVec) {
685       OS << "  Function: " << Cand.front()->Inst->getFunction()->getName().str()
686          << ",  Basic Block: ";
687       if (Cand.front()->Inst->getParent()->getName().str() == "")
688         OS << "(unnamed)\n";
689       else
690         OS << Cand.front()->Inst->getParent()->getName().str() << "\n";
691     }
692   }
693 
694   return PreservedAnalyses::all();
695 }
696 
697 char IRSimilarityIdentifierWrapperPass::ID = 0;
698