1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct a vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 #include "llvm/Transforms/Vectorize.h"
19 #include "llvm/ADT/MapVector.h"
20 #include "llvm/ADT/Optional.h"
21 #include "llvm/ADT/PostOrderIterator.h"
22 #include "llvm/ADT/SetVector.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/AliasAnalysis.h"
25 #include "llvm/Analysis/AssumptionCache.h"
26 #include "llvm/Analysis/CodeMetrics.h"
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/Analysis/ScalarEvolution.h"
29 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
30 #include "llvm/Analysis/TargetTransformInfo.h"
31 #include "llvm/Analysis/ValueTracking.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/Dominators.h"
34 #include "llvm/IR/IRBuilder.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Module.h"
38 #include "llvm/IR/NoFolder.h"
39 #include "llvm/IR/Type.h"
40 #include "llvm/IR/Value.h"
41 #include "llvm/IR/Verifier.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include "llvm/Transforms/Utils/VectorUtils.h"
47 #include <algorithm>
48 #include <map>
49 #include <memory>
50
51 using namespace llvm;
52
53 #define SV_NAME "slp-vectorizer"
54 #define DEBUG_TYPE "SLP"
55
56 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
57
58 static cl::opt<int>
59 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
60 cl::desc("Only vectorize if you gain more than this "
61 "number "));
62
63 static cl::opt<bool>
64 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
65 cl::desc("Attempt to vectorize horizontal reductions"));
66
67 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
68 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
69 cl::desc(
70 "Attempt to vectorize horizontal reductions feeding into a store"));
71
72 namespace {
73
74 static const unsigned MinVecRegSize = 128;
75
76 static const unsigned RecursionMaxDepth = 12;
77
78 // Limit the number of alias checks. The limit is chosen so that
79 // it has no negative effect on the llvm benchmarks.
80 static const unsigned AliasedCheckLimit = 10;
81
82 // Another limit for the alias checks: The maximum distance between load/store
83 // instructions where alias checks are done.
84 // This limit is useful for very large basic blocks.
85 static const unsigned MaxMemDepDistance = 160;
86
87 /// \brief Predicate for the element types that the SLP vectorizer supports.
88 ///
89 /// The most important thing to filter here are types which are invalid in LLVM
90 /// vectors. We also filter target specific types which have absolutely no
91 /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
92 /// avoids spending time checking the cost model and realizing that they will
93 /// be inevitably scalarized.
94 static bool isValidElementType(Type *Ty) {
95 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
96 !Ty->isPPC_FP128Ty();
97 }
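// For example, i32 and float are valid element types here, while x86_fp80 is
// rejected up front instead of being discovered as unprofitable later.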
98
99 /// \returns the parent basic block if all of the instructions in \p VL
100 /// are in the same block or null otherwise.
101 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
102 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
103 if (!I0)
104 return nullptr;
105 BasicBlock *BB = I0->getParent();
106 for (int i = 1, e = VL.size(); i < e; i++) {
107 Instruction *I = dyn_cast<Instruction>(VL[i]);
108 if (!I)
109 return nullptr;
110
111 if (BB != I->getParent())
112 return nullptr;
113 }
114 return BB;
115 }
116
117 /// \returns True if all of the values in \p VL are constants.
118 static bool allConstant(ArrayRef<Value *> VL) {
119 for (unsigned i = 0, e = VL.size(); i < e; ++i)
120 if (!isa<Constant>(VL[i]))
121 return false;
122 return true;
123 }
124
125 /// \returns True if all of the values in \p VL are identical.
126 static bool isSplat(ArrayRef<Value *> VL) {
127 for (unsigned i = 1, e = VL.size(); i < e; ++i)
128 if (VL[i] != VL[0])
129 return false;
130 return true;
131 }
132
133 ///\returns Opcode that can be clubbed with \p Op to create an alternate
134 /// sequence which can later be merged as a ShuffleVector instruction.
135 static unsigned getAltOpcode(unsigned Op) {
136 switch (Op) {
137 case Instruction::FAdd:
138 return Instruction::FSub;
139 case Instruction::FSub:
140 return Instruction::FAdd;
141 case Instruction::Add:
142 return Instruction::Sub;
143 case Instruction::Sub:
144 return Instruction::Add;
145 default:
146 return 0;
147 }
148 }
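// Illustrative example: getAltOpcode(Instruction::FAdd) == Instruction::FSub,
// so a bundle whose lanes alternate fadd/fsub (an "addsub" pattern) can later
// be merged into vector code via a ShuffleVector, as described above.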
149
150 ///\returns bool representing if Opcode \p Op can be part
151 /// of an alternate sequence which can later be merged as
152 /// a ShuffleVector instruction.
153 static bool canCombineAsAltInst(unsigned Op) {
154 if (Op == Instruction::FAdd || Op == Instruction::FSub ||
155 Op == Instruction::Sub || Op == Instruction::Add)
156 return true;
157 return false;
158 }
159
160 /// \returns the ShuffleVector opcode if the instructions in \p VL form an
161 /// alternate fadd,fsub / fsub,fadd / add,sub / sub,add sequence
162 /// (i.e. opcodes of fadd,fsub,fadd,fsub...), and zero otherwise.
163 static unsigned isAltInst(ArrayRef<Value *> VL) {
164 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
165 unsigned Opcode = I0->getOpcode();
166 unsigned AltOpcode = getAltOpcode(Opcode);
167 for (int i = 1, e = VL.size(); i < e; i++) {
168 Instruction *I = dyn_cast<Instruction>(VL[i]);
169 if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
170 return 0;
171 }
172 return Instruction::ShuffleVector;
173 }
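// Illustrative example: for VL = {fadd, fsub, fadd, fsub} (even lanes use the
// opcode of lane 0, odd lanes use its alternate), isAltInst returns
// Instruction::ShuffleVector; any other mix returns 0.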
174
175 /// \returns The opcode if all of the Instructions in \p VL have the same
176 /// opcode, or zero.
177 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
178 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
179 if (!I0)
180 return 0;
181 unsigned Opcode = I0->getOpcode();
182 for (int i = 1, e = VL.size(); i < e; i++) {
183 Instruction *I = dyn_cast<Instruction>(VL[i]);
184 if (!I || Opcode != I->getOpcode()) {
185 if (canCombineAsAltInst(Opcode) && i == 1)
186 return isAltInst(VL);
187 return 0;
188 }
189 }
190 return Opcode;
191 }
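// Illustrative example: four `add` instructions yield Instruction::Add, while
// a mix such as {add, mul, add, mul} yields 0 because add/mul is not an
// alternate (add/sub or fadd/fsub) sequence.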
192
193 /// Get the intersection (logical and) of all of the potential IR flags
194 /// of each scalar operation (VL) that will be converted into a vector (I).
195 /// Flag set: NSW, NUW, exact, and all of fast-math.
196 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
197 if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
198 if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
199 // Intersection is initialized to the 0th scalar,
200 // so start counting from index '1'.
201 for (int i = 1, e = VL.size(); i < e; ++i) {
202 if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
203 Intersection->andIRFlags(Scalar);
204 }
205 VecOp->copyIRFlags(Intersection);
206 }
207 }
208 }
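// Illustrative example: if three scalars are `add nuw nsw` and one is only
// `add nsw`, the intersection leaves just `nsw`, which is what the vector add
// will carry.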
209
210 /// \returns \p I after propagating metadata from \p VL.
211 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
212 Instruction *I0 = cast<Instruction>(VL[0]);
213 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
214 I0->getAllMetadataOtherThanDebugLoc(Metadata);
215
216 for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
217 unsigned Kind = Metadata[i].first;
218 MDNode *MD = Metadata[i].second;
219
220 for (int i = 1, e = VL.size(); MD && i != e; i++) {
221 Instruction *I = cast<Instruction>(VL[i]);
222 MDNode *IMD = I->getMetadata(Kind);
223
224 switch (Kind) {
225 default:
226 MD = nullptr; // Remove unknown metadata
227 break;
228 case LLVMContext::MD_tbaa:
229 MD = MDNode::getMostGenericTBAA(MD, IMD);
230 break;
231 case LLVMContext::MD_alias_scope:
232 MD = MDNode::getMostGenericAliasScope(MD, IMD);
233 break;
234 case LLVMContext::MD_noalias:
235 MD = MDNode::intersect(MD, IMD);
236 break;
237 case LLVMContext::MD_fpmath:
238 MD = MDNode::getMostGenericFPMath(MD, IMD);
239 break;
240 }
241 }
242 I->setMetadata(Kind, MD);
243 }
244 return I;
245 }
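// Illustrative example: !tbaa metadata on the scalars is merged into the most
// generic common tag for the vector instruction, while metadata kinds not
// listed above are simply dropped.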
246
247 /// \returns The type that all of the values in \p VL have or null if there
248 /// are different types.
249 static Type* getSameType(ArrayRef<Value *> VL) {
250 Type *Ty = VL[0]->getType();
251 for (int i = 1, e = VL.size(); i < e; i++)
252 if (VL[i]->getType() != Ty)
253 return nullptr;
254
255 return Ty;
256 }
257
258 /// \returns True if the ExtractElement instructions in VL can be vectorized
259 /// to use the original vector.
260 static bool CanReuseExtract(ArrayRef<Value *> VL) {
261 assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
262 // Check if all of the extracts come from the same vector and from the
263 // correct offset.
264 Value *VL0 = VL[0];
265 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
266 Value *Vec = E0->getOperand(0);
267
268 // We have to extract from the same vector type.
269 unsigned NElts = Vec->getType()->getVectorNumElements();
270
271 if (NElts != VL.size())
272 return false;
273
274 // Check that all of the indices extract from the correct offset.
275 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
276 if (!CI || CI->getZExtValue())
277 return false;
278
279 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
280 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
281 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
282
283 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
284 return false;
285 }
286
287 return true;
288 }
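// Illustrative example: extracting lanes 0,1,2,3 (in that order) from one and
// the same <4 x i32> vector can reuse that vector directly; extracting lanes
// 2,3,0,1 or mixing two source vectors cannot.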
289
290 /// \returns True if an in-tree use also needs an extract. This refers to a
291 /// possible scalar operand of a vectorized instruction.
292 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
293 TargetLibraryInfo *TLI) {
294
295 unsigned Opcode = UserInst->getOpcode();
296 switch (Opcode) {
297 case Instruction::Load: {
298 LoadInst *LI = cast<LoadInst>(UserInst);
299 return (LI->getPointerOperand() == Scalar);
300 }
301 case Instruction::Store: {
302 StoreInst *SI = cast<StoreInst>(UserInst);
303 return (SI->getPointerOperand() == Scalar);
304 }
305 case Instruction::Call: {
306 CallInst *CI = cast<CallInst>(UserInst);
307 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
308 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
309 return (CI->getArgOperand(1) == Scalar);
310 }
311 }
312 default:
313 return false;
314 }
315 }
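// Illustrative example: when a store bundle is vectorized, the vector store
// still takes a scalar pointer operand; if that pointer is also a scalar
// produced elsewhere in the tree, it has to be extracted back out.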
316
317 /// \returns the AA location that is being accessed by the instruction.
318 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
319 if (StoreInst *SI = dyn_cast<StoreInst>(I))
320 return AA->getLocation(SI);
321 if (LoadInst *LI = dyn_cast<LoadInst>(I))
322 return AA->getLocation(LI);
323 return AliasAnalysis::Location();
324 }
325
326 /// \returns True if the instruction is not a volatile or atomic load/store.
327 static bool isSimple(Instruction *I) {
328 if (LoadInst *LI = dyn_cast<LoadInst>(I))
329 return LI->isSimple();
330 if (StoreInst *SI = dyn_cast<StoreInst>(I))
331 return SI->isSimple();
332 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
333 return !MI->isVolatile();
334 return true;
335 }
336
337 /// Bottom Up SLP Vectorizer.
338 class BoUpSLP {
339 public:
340 typedef SmallVector<Value *, 8> ValueList;
341 typedef SmallVector<Instruction *, 16> InstrList;
342 typedef SmallPtrSet<Value *, 16> ValueSet;
343 typedef SmallVector<StoreInst *, 8> StoreList;
344
345 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
346 TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
347 DominatorTree *Dt, AssumptionCache *AC)
348 : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
349 SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
350 Builder(Se->getContext()) {
351 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
352 }
353
354 /// \brief Vectorize the tree that starts with the elements in \p VL.
355 /// Returns the vectorized root.
356 Value *vectorizeTree();
357
358 /// \returns the cost incurred by unwanted spills and fills, caused by
359 /// holding live values over call sites.
360 int getSpillCost();
361
362 /// \returns the vectorization cost of the subtree that starts at \p VL.
363 /// A negative number means that this is profitable.
364 int getTreeCost();
365
366 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
367 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
368 void buildTree(ArrayRef<Value *> Roots,
369 ArrayRef<Value *> UserIgnoreLst = None);
370
371 /// Clear the internal data structures that are created by 'buildTree'.
372 void deleteTree() {
373 VectorizableTree.clear();
374 ScalarToTreeEntry.clear();
375 MustGather.clear();
376 ExternalUses.clear();
377 NumLoadsWantToKeepOrder = 0;
378 NumLoadsWantToChangeOrder = 0;
379 for (auto &Iter : BlocksSchedules) {
380 BlockScheduling *BS = Iter.second.get();
381 BS->clear();
382 }
383 }
384
385 /// \returns true if the memory operations A and B are consecutive.
386 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
387
388 /// \brief Perform LICM and CSE on the newly generated gather sequences.
389 void optimizeGatherSequence();
390
391 /// \returns true if it is beneficial to reverse the vector order.
392 bool shouldReorder() const {
393 return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
394 }
395
396 private:
397 struct TreeEntry;
398
399 /// \returns the cost of the vectorizable entry.
400 int getEntryCost(TreeEntry *E);
401
402 /// This is the recursive part of buildTree.
403 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
404
405 /// Vectorize a single entry in the tree.
406 Value *vectorizeTree(TreeEntry *E);
407
408 /// Vectorize a single entry in the tree, starting in \p VL.
409 Value *vectorizeTree(ArrayRef<Value *> VL);
410
411 /// \returns the pointer to the vectorized value if \p VL is already
412 /// vectorized, or NULL. The latter may happen because of cycles.
413 Value *alreadyVectorized(ArrayRef<Value *> VL) const;
414
415 /// \brief Take the pointer operand from the Load/Store instruction.
416 /// \returns NULL if this is not a valid Load/Store instruction.
417 static Value *getPointerOperand(Value *I);
418
419 /// \brief Take the address space operand from the Load/Store instruction.
420 /// \returns -1 if this is not a valid Load/Store instruction.
421 static unsigned getAddressSpaceOperand(Value *I);
422
423 /// \returns the scalarization cost for this type. Scalarization in this
424 /// context means the creation of vectors from a group of scalars.
425 int getGatherCost(Type *Ty);
426
427 /// \returns the scalarization cost for this list of values. Assuming that
428 /// this subtree gets vectorized, we may need to extract the values from the
429 /// roots. This method calculates the cost of extracting the values.
430 int getGatherCost(ArrayRef<Value *> VL);
431
432 /// \brief Set the Builder insert point to one after the last instruction in
433 /// the bundle
434 void setInsertPointAfterBundle(ArrayRef<Value *> VL);
435
436 /// \returns a vector from a collection of scalars in \p VL.
437 Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
438
439 /// \returns whether the VectorizableTree is fully vectorizable and will
440 /// be beneficial even if the tree height is tiny.
441 bool isFullyVectorizableTinyTree();
442
443 /// \brief Reorder commutative operands in an alt shuffle if that results in
444 /// vectorized code.
445 void reorderAltShuffleOperands(ArrayRef<Value *> VL,
446 SmallVectorImpl<Value *> &Left,
447 SmallVectorImpl<Value *> &Right);
448 /// \brief Reorder commutative operands to get a better probability of
449 /// generating vectorized code.
450 void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
451 SmallVectorImpl<Value *> &Left,
452 SmallVectorImpl<Value *> &Right);
453 struct TreeEntry {
454 TreeEntry() : Scalars(), VectorizedValue(nullptr),
455 NeedToGather(0) {}
456
457 /// \returns true if the scalars in VL are equal to this entry.
458 bool isSame(ArrayRef<Value *> VL) const {
459 assert(VL.size() == Scalars.size() && "Invalid size");
460 return std::equal(VL.begin(), VL.end(), Scalars.begin());
461 }
462
463 /// A vector of scalars.
464 ValueList Scalars;
465
466 /// The Scalars are vectorized into this value. It is initialized to Null.
467 Value *VectorizedValue;
468
469 /// Do we need to gather this sequence ?
470 bool NeedToGather;
471 };
472
473 /// Create a new VectorizableTree entry.
474 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
475 VectorizableTree.push_back(TreeEntry());
476 int idx = VectorizableTree.size() - 1;
477 TreeEntry *Last = &VectorizableTree[idx];
478 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
479 Last->NeedToGather = !Vectorized;
480 if (Vectorized) {
481 for (int i = 0, e = VL.size(); i != e; ++i) {
482 assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
483 ScalarToTreeEntry[VL[i]] = idx;
484 }
485 } else {
486 MustGather.insert(VL.begin(), VL.end());
487 }
488 return Last;
489 }
490
491 /// -- Vectorization State --
492 /// Holds all of the tree entries.
493 std::vector<TreeEntry> VectorizableTree;
494
495 /// Maps a specific scalar to its tree entry.
496 SmallDenseMap<Value*, int> ScalarToTreeEntry;
497
498 /// A list of scalars that we found that we need to keep as scalars.
499 ValueSet MustGather;
500
501 /// This POD struct describes one external user in the vectorized tree.
502 struct ExternalUser {
503 ExternalUser (Value *S, llvm::User *U, int L) :
504 Scalar(S), User(U), Lane(L){};
505 // Which scalar in our function.
506 Value *Scalar;
507 // Which user that uses the scalar.
508 llvm::User *User;
509 // Which lane does the scalar belong to.
510 int Lane;
511 };
512 typedef SmallVector<ExternalUser, 16> UserList;
513
514 /// Checks if two instructions may access the same memory.
515 ///
516 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
517 /// is invariant in the calling loop.
518 bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1,
519 Instruction *Inst2) {
520
521 // First check if the result is already in the cache.
522 AliasCacheKey key = std::make_pair(Inst1, Inst2);
523 Optional<bool> &result = AliasCache[key];
524 if (result.hasValue()) {
525 return result.getValue();
526 }
527 AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
528 bool aliased = true;
529 if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
530 // Do the alias check.
531 aliased = AA->alias(Loc1, Loc2);
532 }
533 // Store the result in the cache.
534 result = aliased;
535 return aliased;
536 }
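// Illustrative example: the (Inst1, Inst2) pair is the cache key, so a
// repeated query for the same two instructions is answered without calling
// into AliasAnalysis again.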
537
538 typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
539
540 /// Cache for alias results.
541 /// TODO: consider moving this to the AliasAnalysis itself.
542 DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
543
544 /// Removes an instruction from its block and eventually deletes it.
545 /// It's like Instruction::eraseFromParent() except that the actual deletion
546 /// is delayed until BoUpSLP is destructed.
547 /// This is required to ensure that there are no incorrect collisions in the
548 /// AliasCache, which can happen if a new instruction is allocated at the
549 /// same address as a previously deleted instruction.
550 void eraseInstruction(Instruction *I) {
551 I->removeFromParent();
552 I->dropAllReferences();
553 DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
554 }
555
556 /// Temporary store for deleted instructions. Instructions will be deleted
557 /// eventually when the BoUpSLP is destructed.
558 SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
559
560 /// A list of values that need to extracted out of the tree.
561 /// This list holds pairs of (Internal Scalar : External User).
562 UserList ExternalUses;
563
564 /// Values used only by @llvm.assume calls.
565 SmallPtrSet<const Value *, 32> EphValues;
566
567 /// Holds all of the instructions that we gathered.
568 SetVector<Instruction *> GatherSeq;
569 /// A list of blocks that we are going to CSE.
570 SetVector<BasicBlock *> CSEBlocks;
571
572 /// Contains all scheduling relevant data for an instruction.
573 /// A ScheduleData either represents a single instruction or a member of an
574 /// instruction bundle (= a group of instructions which is combined into a
575 /// vector instruction).
576 struct ScheduleData {
577
578 // The initial value for the dependency counters. It means that the
579 // dependencies are not calculated yet.
580 enum { InvalidDeps = -1 };
581
582 ScheduleData()
583 : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
584 NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
585 Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
586 UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
587
588 void init(int BlockSchedulingRegionID) {
589 FirstInBundle = this;
590 NextInBundle = nullptr;
591 NextLoadStore = nullptr;
592 IsScheduled = false;
593 SchedulingRegionID = BlockSchedulingRegionID;
594 UnscheduledDepsInBundle = UnscheduledDeps;
595 clearDependencies();
596 }
597
598 /// Returns true if the dependency information has been calculated.
599 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
600
601 /// Returns true for single instructions and for bundle representatives
602 /// (= the head of a bundle).
603 bool isSchedulingEntity() const { return FirstInBundle == this; }
604
605 /// Returns true if it represents an instruction bundle and not only a
606 /// single instruction.
607 bool isPartOfBundle() const {
608 return NextInBundle != nullptr || FirstInBundle != this;
609 }
610
611 /// Returns true if it is ready for scheduling, i.e. it has no more
612 /// unscheduled depending instructions/bundles.
613 bool isReady() const {
614 assert(isSchedulingEntity() &&
615 "can't consider non-scheduling entity for ready list");
616 return UnscheduledDepsInBundle == 0 && !IsScheduled;
617 }
618
619 /// Modifies the number of unscheduled dependencies, also updating it for
620 /// the whole bundle.
621 int incrementUnscheduledDeps(int Incr) {
622 UnscheduledDeps += Incr;
623 return FirstInBundle->UnscheduledDepsInBundle += Incr;
624 }
625
626 /// Sets the number of unscheduled dependencies to the number of
627 /// dependencies.
628 void resetUnscheduledDeps() {
629 incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
630 }
631
632 /// Clears all dependency information.
633 void clearDependencies() {
634 Dependencies = InvalidDeps;
635 resetUnscheduledDeps();
636 MemoryDependencies.clear();
637 }
638
639 void dump(raw_ostream &os) const {
640 if (!isSchedulingEntity()) {
641 os << "/ " << *Inst;
642 } else if (NextInBundle) {
643 os << '[' << *Inst;
644 ScheduleData *SD = NextInBundle;
645 while (SD) {
646 os << ';' << *SD->Inst;
647 SD = SD->NextInBundle;
648 }
649 os << ']';
650 } else {
651 os << *Inst;
652 }
653 }
654
655 Instruction *Inst;
656
657 /// Points to the head in an instruction bundle (and always to this for
658 /// single instructions).
659 ScheduleData *FirstInBundle;
660
661 /// Single linked list of all instructions in a bundle. Null if it is a
662 /// single instruction.
663 ScheduleData *NextInBundle;
664
665 /// Single linked list of all memory instructions (e.g. load, store, call)
666 /// in the block - until the end of the scheduling region.
667 ScheduleData *NextLoadStore;
668
669 /// The dependent memory instructions.
670 /// This list is derived on demand in calculateDependencies().
671 SmallVector<ScheduleData *, 4> MemoryDependencies;
672
673 /// This ScheduleData is in the current scheduling region if this matches
674 /// the current SchedulingRegionID of BlockScheduling.
675 int SchedulingRegionID;
676
677 /// Used for getting a "good" final ordering of instructions.
678 int SchedulingPriority;
679
680 /// The number of dependencies. Consists of the number of users of the
681 /// instruction plus the number of dependent memory instructions (if any).
682 /// This value is calculated on demand.
683 /// If InvalidDeps, the number of dependencies is not calculated yet.
684 ///
685 int Dependencies;
686
687 /// The number of dependencies minus the number of dependencies of scheduled
688 /// instructions. As soon as this is zero, the instruction/bundle gets ready
689 /// for scheduling.
690 /// Note that this is negative as long as Dependencies is not calculated.
691 int UnscheduledDeps;
692
693 /// The sum of UnscheduledDeps in a bundle. Equals UnscheduledDeps for
694 /// single instructions.
695 int UnscheduledDepsInBundle;
696
697 /// True if this instruction is scheduled (or considered as scheduled in the
698 /// dry-run).
699 bool IsScheduled;
700 };
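// Illustrative example of the dump() format above: the head of a two-element
// bundle prints as "[inst0;inst1]", a bundle member that is not the head
// prints as "/ inst", and a plain single instruction prints as-is.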
701
702 #ifndef NDEBUG
703 friend raw_ostream &operator<<(raw_ostream &os,
704 const BoUpSLP::ScheduleData &SD);
705 #endif
706
707 /// Contains all scheduling data for a basic block.
708 ///
709 struct BlockScheduling {
710
711 BlockScheduling(BasicBlock *BB)
712 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
713 ScheduleStart(nullptr), ScheduleEnd(nullptr),
714 FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
715 // Make sure that the initial SchedulingRegionID is greater than the
716 // initial SchedulingRegionID in ScheduleData (which is 0).
717 SchedulingRegionID(1) {}
718
719 void clear() {
720 ReadyInsts.clear();
721 ScheduleStart = nullptr;
722 ScheduleEnd = nullptr;
723 FirstLoadStoreInRegion = nullptr;
724 LastLoadStoreInRegion = nullptr;
725
726 // Make a new scheduling region, i.e. all existing ScheduleData is not
727 // in the new region yet.
728 ++SchedulingRegionID;
729 }
730
731 ScheduleData *getScheduleData(Value *V) {
732 ScheduleData *SD = ScheduleDataMap[V];
733 if (SD && SD->SchedulingRegionID == SchedulingRegionID)
734 return SD;
735 return nullptr;
736 }
737
738 bool isInSchedulingRegion(ScheduleData *SD) {
739 return SD->SchedulingRegionID == SchedulingRegionID;
740 }
741
742 /// Marks an instruction as scheduled and puts all dependent ready
743 /// instructions into the ready-list.
744 template <typename ReadyListType>
745 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
746 SD->IsScheduled = true;
747 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
748
749 ScheduleData *BundleMember = SD;
750 while (BundleMember) {
751 // Handle the def-use chain dependencies.
752 for (Use &U : BundleMember->Inst->operands()) {
753 ScheduleData *OpDef = getScheduleData(U.get());
754 if (OpDef && OpDef->hasValidDependencies() &&
755 OpDef->incrementUnscheduledDeps(-1) == 0) {
756 // There are no more unscheduled dependencies after decrementing,
757 // so we can put the dependent instruction into the ready list.
758 ScheduleData *DepBundle = OpDef->FirstInBundle;
759 assert(!DepBundle->IsScheduled &&
760 "already scheduled bundle gets ready");
761 ReadyList.insert(DepBundle);
762 DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
763 }
764 }
765 // Handle the memory dependencies.
766 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
767 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
768 // There are no more unscheduled dependencies after decrementing,
769 // so we can put the dependent instruction into the ready list.
770 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
771 assert(!DepBundle->IsScheduled &&
772 "already scheduled bundle gets ready");
773 ReadyList.insert(DepBundle);
774 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
775 }
776 }
777 BundleMember = BundleMember->NextInBundle;
778 }
779 }
780
781 /// Put all instructions into the ReadyList which are ready for scheduling.
782 template <typename ReadyListType>
783 void initialFillReadyList(ReadyListType &ReadyList) {
784 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
785 ScheduleData *SD = getScheduleData(I);
786 if (SD->isSchedulingEntity() && SD->isReady()) {
787 ReadyList.insert(SD);
788 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
789 }
790 }
791 }
792
793 /// Checks if a bundle of instructions can be scheduled, i.e. has no
794 /// cyclic dependencies. This is only a dry-run, no instructions are
795 /// actually moved at this stage.
796 bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
797
798 /// Un-bundles a group of instructions.
799 void cancelScheduling(ArrayRef<Value *> VL);
800
801 /// Extends the scheduling region so that V is inside the region.
802 void extendSchedulingRegion(Value *V);
803
804 /// Initialize the ScheduleData structures for new instructions in the
805 /// scheduling region.
806 void initScheduleData(Instruction *FromI, Instruction *ToI,
807 ScheduleData *PrevLoadStore,
808 ScheduleData *NextLoadStore);
809
810 /// Updates the dependency information of a bundle and of all instructions/
811 /// bundles which depend on the original bundle.
812 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
813 BoUpSLP *SLP);
814
815 /// Sets all instructions in the scheduling region to un-scheduled.
816 void resetSchedule();
817
818 BasicBlock *BB;
819
820 /// Simple memory allocation for ScheduleData.
821 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
822
823 /// The size of a ScheduleData array in ScheduleDataChunks.
824 int ChunkSize;
825
826 /// The allocator position in the current chunk, which is the last entry
827 /// of ScheduleDataChunks.
828 int ChunkPos;
829
830 /// Attaches ScheduleData to Instruction.
831 /// Note that the mapping survives during all vectorization iterations, i.e.
832 /// ScheduleData structures are recycled.
833 DenseMap<Value *, ScheduleData *> ScheduleDataMap;
834
835 struct ReadyList : SmallVector<ScheduleData *, 8> {
836 void insert(ScheduleData *SD) { push_back(SD); }
837 };
838
839 /// The ready-list for scheduling (only used for the dry-run).
840 ReadyList ReadyInsts;
841
842 /// The first instruction of the scheduling region.
843 Instruction *ScheduleStart;
844
845 /// The first instruction _after_ the scheduling region.
846 Instruction *ScheduleEnd;
847
848 /// The first memory accessing instruction in the scheduling region
849 /// (can be null).
850 ScheduleData *FirstLoadStoreInRegion;
851
852 /// The last memory accessing instruction in the scheduling region
853 /// (can be null).
854 ScheduleData *LastLoadStoreInRegion;
855
856 /// The ID of the scheduling region. For a new vectorization iteration this
857 /// is incremented which "removes" all ScheduleData from the region.
858 int SchedulingRegionID;
859 };
860
861 /// Attaches the BlockScheduling structures to basic blocks.
862 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
863
864 /// Performs the "real" scheduling. Done before vectorization is actually
865 /// performed in a basic block.
866 void scheduleBlock(BlockScheduling *BS);
867
868 /// List of users to ignore during scheduling and that don't need extracting.
869 ArrayRef<Value *> UserIgnoreList;
870
871 // Number of load-bundles, which contain consecutive loads.
872 int NumLoadsWantToKeepOrder;
873
874 // Number of load-bundles of size 2, which are consecutive loads if reversed.
875 int NumLoadsWantToChangeOrder;
876
877 // Analysis and block reference.
878 Function *F;
879 ScalarEvolution *SE;
880 TargetTransformInfo *TTI;
881 TargetLibraryInfo *TLI;
882 AliasAnalysis *AA;
883 LoopInfo *LI;
884 DominatorTree *DT;
885 /// Instruction builder to construct the vectorized tree.
886 IRBuilder<> Builder;
887 };
888
889 #ifndef NDEBUG
890 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
891 SD.dump(os);
892 return os;
893 }
894 #endif
895
896 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
897 ArrayRef<Value *> UserIgnoreLst) {
898 deleteTree();
899 UserIgnoreList = UserIgnoreLst;
900 if (!getSameType(Roots))
901 return;
902 buildTree_rec(Roots, 0);
903
904 // Collect the values that we need to extract from the tree.
905 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
906 TreeEntry *Entry = &VectorizableTree[EIdx];
907
908 // For each lane:
909 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
910 Value *Scalar = Entry->Scalars[Lane];
911
912 // No need to handle users of gathered values.
913 if (Entry->NeedToGather)
914 continue;
915
916 for (User *U : Scalar->users()) {
917 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
918
919 Instruction *UserInst = dyn_cast<Instruction>(U);
920 if (!UserInst)
921 continue;
922
923 // Skip in-tree scalars that become vectors
924 if (ScalarToTreeEntry.count(U)) {
925 int Idx = ScalarToTreeEntry[U];
926 TreeEntry *UseEntry = &VectorizableTree[Idx];
927 Value *UseScalar = UseEntry->Scalars[0];
928 // Some in-tree scalars will remain as scalar in vectorized
929 // instructions. If that is the case, the one in Lane 0 will
930 // be used.
931 if (UseScalar != U ||
932 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
933 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
934 << ".\n");
935 assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
936 continue;
937 }
938 }
939
940 // Ignore users in the user ignore list.
941 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
942 UserIgnoreList.end())
943 continue;
944
945 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
946 Lane << " from " << *Scalar << ".\n");
947 ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
948 }
949 }
950 }
951 }
952
953
954 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
955 bool SameTy = getSameType(VL); (void)SameTy;
956 bool isAltShuffle = false;
957 assert(SameTy && "Invalid types!");
958
959 if (Depth == RecursionMaxDepth) {
960 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
961 newTreeEntry(VL, false);
962 return;
963 }
964
965 // Don't handle vectors.
966 if (VL[0]->getType()->isVectorTy()) {
967 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
968 newTreeEntry(VL, false);
969 return;
970 }
971
972 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
973 if (SI->getValueOperand()->getType()->isVectorTy()) {
974 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
975 newTreeEntry(VL, false);
976 return;
977 }
978 unsigned Opcode = getSameOpcode(VL);
979
980 // Check that this shuffle vector refers to the alternate
981 // sequence of opcodes.
982 if (Opcode == Instruction::ShuffleVector) {
983 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
984 unsigned Op = I0->getOpcode();
985 if (Op != Instruction::ShuffleVector)
986 isAltShuffle = true;
987 }
988
989 // If all of the operands are identical or constant we have a simple solution.
990 if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
991 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
992 newTreeEntry(VL, false);
993 return;
994 }
995
996 // We now know that this is a vector of instructions of the same type from
997 // the same block.
998
999 // Don't vectorize ephemeral values.
1000 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1001 if (EphValues.count(VL[i])) {
1002 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1003 ") is ephemeral.\n");
1004 newTreeEntry(VL, false);
1005 return;
1006 }
1007 }
1008
1009 // Check if this is a duplicate of another entry.
1010 if (ScalarToTreeEntry.count(VL[0])) {
1011 int Idx = ScalarToTreeEntry[VL[0]];
1012 TreeEntry *E = &VectorizableTree[Idx];
1013 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1014 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1015 if (E->Scalars[i] != VL[i]) {
1016 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1017 newTreeEntry(VL, false);
1018 return;
1019 }
1020 }
1021 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1022 return;
1023 }
1024
1025 // Check that none of the instructions in the bundle are already in the tree.
1026 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1027 if (ScalarToTreeEntry.count(VL[i])) {
1028 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1029 ") is already in tree.\n");
1030 newTreeEntry(VL, false);
1031 return;
1032 }
1033 }
1034
1035 // If any of the scalars is marked as a value that needs to stay scalar then
1036 // we need to gather the scalars.
1037 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1038 if (MustGather.count(VL[i])) {
1039 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1040 newTreeEntry(VL, false);
1041 return;
1042 }
1043 }
1044
1045 // Check that all of the users of the scalars that we want to vectorize are
1046 // schedulable.
1047 Instruction *VL0 = cast<Instruction>(VL[0]);
1048 BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1049
1050 if (!DT->isReachableFromEntry(BB)) {
1051 // Don't go into unreachable blocks. They may contain instructions with
1052 // dependency cycles which confuse the final scheduling.
1053 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1054 newTreeEntry(VL, false);
1055 return;
1056 }
1057
1058 // Check that every instruction appears only once in this bundle.
1059 for (unsigned i = 0, e = VL.size(); i < e; ++i)
1060 for (unsigned j = i+1; j < e; ++j)
1061 if (VL[i] == VL[j]) {
1062 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1063 newTreeEntry(VL, false);
1064 return;
1065 }
1066
1067 auto &BSRef = BlocksSchedules[BB];
1068 if (!BSRef) {
1069 BSRef = llvm::make_unique<BlockScheduling>(BB);
1070 }
1071 BlockScheduling &BS = *BSRef.get();
1072
1073 if (!BS.tryScheduleBundle(VL, this)) {
1074 DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1075 BS.cancelScheduling(VL);
1076 newTreeEntry(VL, false);
1077 return;
1078 }
1079 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1080
1081 switch (Opcode) {
1082 case Instruction::PHI: {
1083 PHINode *PH = dyn_cast<PHINode>(VL0);
1084
1085 // Check for terminator values (e.g. invoke).
1086 for (unsigned j = 0; j < VL.size(); ++j)
1087 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1088 TerminatorInst *Term = dyn_cast<TerminatorInst>(
1089 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1090 if (Term) {
1091 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1092 BS.cancelScheduling(VL);
1093 newTreeEntry(VL, false);
1094 return;
1095 }
1096 }
1097
1098 newTreeEntry(VL, true);
1099 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1100
1101 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1102 ValueList Operands;
1103 // Prepare the operand vector.
1104 for (unsigned j = 0; j < VL.size(); ++j)
1105 Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1106 PH->getIncomingBlock(i)));
1107
1108 buildTree_rec(Operands, Depth + 1);
1109 }
1110 return;
1111 }
1112 case Instruction::ExtractElement: {
1113 bool Reuse = CanReuseExtract(VL);
1114 if (Reuse) {
1115 DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1116 } else {
1117 BS.cancelScheduling(VL);
1118 }
1119 newTreeEntry(VL, Reuse);
1120 return;
1121 }
1122 case Instruction::Load: {
1123 // Check if the loads are consecutive or if we need to swizzle them.
1124 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1125 LoadInst *L = cast<LoadInst>(VL[i]);
1126 if (!L->isSimple()) {
1127 BS.cancelScheduling(VL);
1128 newTreeEntry(VL, false);
1129 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1130 return;
1131 }
1132 const DataLayout &DL = F->getParent()->getDataLayout();
1133 if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
1134 if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
1135 ++NumLoadsWantToChangeOrder;
1136 }
1137 BS.cancelScheduling(VL);
1138 newTreeEntry(VL, false);
1139 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1140 return;
1141 }
1142 }
1143 ++NumLoadsWantToKeepOrder;
1144 newTreeEntry(VL, true);
1145 DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1146 return;
1147 }
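// Illustrative example: loads from p[0], p[1], p[2], p[3] (in that order)
// form a vectorizable load bundle; if only a reversed pair {p[1], p[0]} is
// seen, NumLoadsWantToChangeOrder is bumped so the caller may retry with the
// operands reversed (see shouldReorder()).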
1148 case Instruction::ZExt:
1149 case Instruction::SExt:
1150 case Instruction::FPToUI:
1151 case Instruction::FPToSI:
1152 case Instruction::FPExt:
1153 case Instruction::PtrToInt:
1154 case Instruction::IntToPtr:
1155 case Instruction::SIToFP:
1156 case Instruction::UIToFP:
1157 case Instruction::Trunc:
1158 case Instruction::FPTrunc:
1159 case Instruction::BitCast: {
1160 Type *SrcTy = VL0->getOperand(0)->getType();
1161 for (unsigned i = 0; i < VL.size(); ++i) {
1162 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1163 if (Ty != SrcTy || !isValidElementType(Ty)) {
1164 BS.cancelScheduling(VL);
1165 newTreeEntry(VL, false);
1166 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1167 return;
1168 }
1169 }
1170 newTreeEntry(VL, true);
1171 DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1172
1173 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1174 ValueList Operands;
1175 // Prepare the operand vector.
1176 for (unsigned j = 0; j < VL.size(); ++j)
1177 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1178
1179 buildTree_rec(Operands, Depth+1);
1180 }
1181 return;
1182 }
1183 case Instruction::ICmp:
1184 case Instruction::FCmp: {
1185 // Check that all of the compares have the same predicate.
1186 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1187 Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1188 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1189 CmpInst *Cmp = cast<CmpInst>(VL[i]);
1190 if (Cmp->getPredicate() != P0 ||
1191 Cmp->getOperand(0)->getType() != ComparedTy) {
1192 BS.cancelScheduling(VL);
1193 newTreeEntry(VL, false);
1194 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1195 return;
1196 }
1197 }
1198
1199 newTreeEntry(VL, true);
1200 DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1201
1202 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1203 ValueList Operands;
1204 // Prepare the operand vector.
1205 for (unsigned j = 0; j < VL.size(); ++j)
1206 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1207
1208 buildTree_rec(Operands, Depth+1);
1209 }
1210 return;
1211 }
1212 case Instruction::Select:
1213 case Instruction::Add:
1214 case Instruction::FAdd:
1215 case Instruction::Sub:
1216 case Instruction::FSub:
1217 case Instruction::Mul:
1218 case Instruction::FMul:
1219 case Instruction::UDiv:
1220 case Instruction::SDiv:
1221 case Instruction::FDiv:
1222 case Instruction::URem:
1223 case Instruction::SRem:
1224 case Instruction::FRem:
1225 case Instruction::Shl:
1226 case Instruction::LShr:
1227 case Instruction::AShr:
1228 case Instruction::And:
1229 case Instruction::Or:
1230 case Instruction::Xor: {
1231 newTreeEntry(VL, true);
1232 DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1233
1234 // Sort operands of the instructions so that each side is more likely to
1235 // have the same opcode.
1236 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1237 ValueList Left, Right;
1238 reorderInputsAccordingToOpcode(VL, Left, Right);
1239 buildTree_rec(Left, Depth + 1);
1240 buildTree_rec(Right, Depth + 1);
1241 return;
1242 }
1243
1244 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1245 ValueList Operands;
1246 // Prepare the operand vector.
1247 for (unsigned j = 0; j < VL.size(); ++j)
1248 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1249
1250 buildTree_rec(Operands, Depth+1);
1251 }
1252 return;
1253 }
1254 case Instruction::GetElementPtr: {
1255 // We don't combine GEPs with complicated (nested) indexing.
1256 for (unsigned j = 0; j < VL.size(); ++j) {
1257 if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1258 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1259 BS.cancelScheduling(VL);
1260 newTreeEntry(VL, false);
1261 return;
1262 }
1263 }
1264
1265 // We can't combine several GEPs into one vector if they operate on
1266 // different types.
1267 Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1268 for (unsigned j = 0; j < VL.size(); ++j) {
1269 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1270 if (Ty0 != CurTy) {
1271 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1272 BS.cancelScheduling(VL);
1273 newTreeEntry(VL, false);
1274 return;
1275 }
1276 }
1277
1278 // We don't combine GEPs with non-constant indexes.
1279 for (unsigned j = 0; j < VL.size(); ++j) {
1280 auto Op = cast<Instruction>(VL[j])->getOperand(1);
1281 if (!isa<ConstantInt>(Op)) {
1282 DEBUG(
1283 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1284 BS.cancelScheduling(VL);
1285 newTreeEntry(VL, false);
1286 return;
1287 }
1288 }
1289
1290 newTreeEntry(VL, true);
1291 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1292 for (unsigned i = 0, e = 2; i < e; ++i) {
1293 ValueList Operands;
1294 // Prepare the operand vector.
1295 for (unsigned j = 0; j < VL.size(); ++j)
1296 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1297
1298 buildTree_rec(Operands, Depth + 1);
1299 }
1300 return;
1301 }
1302 case Instruction::Store: {
1303 const DataLayout &DL = F->getParent()->getDataLayout();
1304 // Check if the stores are consecutive or if we need to swizzle them.
1305 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1306 if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
1307 BS.cancelScheduling(VL);
1308 newTreeEntry(VL, false);
1309 DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1310 return;
1311 }
1312
1313 newTreeEntry(VL, true);
1314 DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1315
1316 ValueList Operands;
1317 for (unsigned j = 0; j < VL.size(); ++j)
1318 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1319
1320 buildTree_rec(Operands, Depth + 1);
1321 return;
1322 }
1323 case Instruction::Call: {
1324 // Check if the calls are all to the same vectorizable intrinsic.
1325 CallInst *CI = cast<CallInst>(VL[0]);
1326 // Check if this is an intrinsic call or something that can be
1327 // represented by an intrinsic call.
1328 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1329 if (!isTriviallyVectorizable(ID)) {
1330 BS.cancelScheduling(VL);
1331 newTreeEntry(VL, false);
1332 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1333 return;
1334 }
1335 Function *Int = CI->getCalledFunction();
1336 Value *A1I = nullptr;
1337 if (hasVectorInstrinsicScalarOpd(ID, 1))
1338 A1I = CI->getArgOperand(1);
1339 for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1340 CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1341 if (!CI2 || CI2->getCalledFunction() != Int ||
1342 getIntrinsicIDForCall(CI2, TLI) != ID) {
1343 BS.cancelScheduling(VL);
1344 newTreeEntry(VL, false);
1345 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1346 << "\n");
1347 return;
1348 }
1349 // ctlz, cttz and powi are special intrinsics whose second argument
1350 // should be the same in order for them to be vectorized.
1351 if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1352 Value *A1J = CI2->getArgOperand(1);
1353 if (A1I != A1J) {
1354 BS.cancelScheduling(VL);
1355 newTreeEntry(VL, false);
1356 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1357 << " argument "<< A1I<<"!=" << A1J
1358 << "\n");
1359 return;
1360 }
1361 }
1362 }
1363
1364 newTreeEntry(VL, true);
1365 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1366 ValueList Operands;
1367 // Prepare the operand vector.
1368 for (unsigned j = 0; j < VL.size(); ++j) {
1369 CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
1370 Operands.push_back(CI2->getArgOperand(i));
1371 }
1372 buildTree_rec(Operands, Depth + 1);
1373 }
1374 return;
1375 }
1376 case Instruction::ShuffleVector: {
1377 // If this is not an alternate sequence of opcode like add-sub
1378 // then do not vectorize this instruction.
1379 if (!isAltShuffle) {
1380 BS.cancelScheduling(VL);
1381 newTreeEntry(VL, false);
1382 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1383 return;
1384 }
1385 newTreeEntry(VL, true);
1386 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1387
1388 // Reorder operands if reordering would enable vectorization.
1389 if (isa<BinaryOperator>(VL0)) {
1390 ValueList Left, Right;
1391 reorderAltShuffleOperands(VL, Left, Right);
1392 buildTree_rec(Left, Depth + 1);
1393 buildTree_rec(Right, Depth + 1);
1394 return;
1395 }
1396
1397 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1398 ValueList Operands;
1399 // Prepare the operand vector.
1400 for (unsigned j = 0; j < VL.size(); ++j)
1401 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1402
1403 buildTree_rec(Operands, Depth + 1);
1404 }
1405 return;
1406 }
1407 default:
1408 BS.cancelScheduling(VL);
1409 newTreeEntry(VL, false);
1410 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1411 return;
1412 }
1413 }
1414
1415 int BoUpSLP::getEntryCost(TreeEntry *E) {
1416 ArrayRef<Value*> VL = E->Scalars;
1417
1418 Type *ScalarTy = VL[0]->getType();
1419 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1420 ScalarTy = SI->getValueOperand()->getType();
1421 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1422
1423 if (E->NeedToGather) {
1424 if (allConstant(VL))
1425 return 0;
1426 if (isSplat(VL)) {
1427 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1428 }
1429 return getGatherCost(E->Scalars);
1430 }
1431 unsigned Opcode = getSameOpcode(VL);
1432 assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1433 Instruction *VL0 = cast<Instruction>(VL[0]);
1434 switch (Opcode) {
1435 case Instruction::PHI: {
1436 return 0;
1437 }
1438 case Instruction::ExtractElement: {
1439 if (CanReuseExtract(VL)) {
1440 int DeadCost = 0;
1441 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1442 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
1443 if (E->hasOneUse())
1444 // Take credit for instruction that will become dead.
1445 DeadCost +=
1446 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1447 }
1448 return -DeadCost;
1449 }
1450 return getGatherCost(VecTy);
1451 }
1452 case Instruction::ZExt:
1453 case Instruction::SExt:
1454 case Instruction::FPToUI:
1455 case Instruction::FPToSI:
1456 case Instruction::FPExt:
1457 case Instruction::PtrToInt:
1458 case Instruction::IntToPtr:
1459 case Instruction::SIToFP:
1460 case Instruction::UIToFP:
1461 case Instruction::Trunc:
1462 case Instruction::FPTrunc:
1463 case Instruction::BitCast: {
1464 Type *SrcTy = VL0->getOperand(0)->getType();
1465
1466 // Calculate the cost of this instruction.
1467 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1468 VL0->getType(), SrcTy);
1469
1470 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1471 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1472 return VecCost - ScalarCost;
1473 }
1474 case Instruction::FCmp:
1475 case Instruction::ICmp:
1476 case Instruction::Select:
1477 case Instruction::Add:
1478 case Instruction::FAdd:
1479 case Instruction::Sub:
1480 case Instruction::FSub:
1481 case Instruction::Mul:
1482 case Instruction::FMul:
1483 case Instruction::UDiv:
1484 case Instruction::SDiv:
1485 case Instruction::FDiv:
1486 case Instruction::URem:
1487 case Instruction::SRem:
1488 case Instruction::FRem:
1489 case Instruction::Shl:
1490 case Instruction::LShr:
1491 case Instruction::AShr:
1492 case Instruction::And:
1493 case Instruction::Or:
1494 case Instruction::Xor: {
1495 // Calculate the cost of this instruction.
1496 int ScalarCost = 0;
1497 int VecCost = 0;
1498 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
1499 Opcode == Instruction::Select) {
1500 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1501 ScalarCost = VecTy->getNumElements() *
1502 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1503 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1504 } else {
1505 // Certain instructions can be cheaper to vectorize if they have a
1506 // constant second vector operand.
1507 TargetTransformInfo::OperandValueKind Op1VK =
1508 TargetTransformInfo::OK_AnyValue;
1509 TargetTransformInfo::OperandValueKind Op2VK =
1510 TargetTransformInfo::OK_UniformConstantValue;
1511 TargetTransformInfo::OperandValueProperties Op1VP =
1512 TargetTransformInfo::OP_None;
1513 TargetTransformInfo::OperandValueProperties Op2VP =
1514 TargetTransformInfo::OP_None;
1515
1516 // If all operands are exactly the same ConstantInt then set the
1517 // operand kind to OK_UniformConstantValue.
1518 // If instead not all operands are constants, then set the operand kind
1519 // to OK_AnyValue. If all operands are constants but not the same,
1520 // then set the operand kind to OK_NonUniformConstantValue.
1521 ConstantInt *CInt = nullptr;
1522 for (unsigned i = 0; i < VL.size(); ++i) {
1523 const Instruction *I = cast<Instruction>(VL[i]);
1524 if (!isa<ConstantInt>(I->getOperand(1))) {
1525 Op2VK = TargetTransformInfo::OK_AnyValue;
1526 break;
1527 }
1528 if (i == 0) {
1529 CInt = cast<ConstantInt>(I->getOperand(1));
1530 continue;
1531 }
1532 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1533 CInt != cast<ConstantInt>(I->getOperand(1)))
1534 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1535 }
1536 // FIXME: Currently the cost model modification for division by a
1537 // power of 2 is handled only for X86. Add support for other targets.
1538 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1539 CInt->getValue().isPowerOf2())
1540 Op2VP = TargetTransformInfo::OP_PowerOf2;
1541
1542 ScalarCost = VecTy->getNumElements() *
1543 TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
1544 Op1VP, Op2VP);
1545 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1546 Op1VP, Op2VP);
1547 }
1548 return VecCost - ScalarCost;
1549 }
1550 case Instruction::GetElementPtr: {
1551 TargetTransformInfo::OperandValueKind Op1VK =
1552 TargetTransformInfo::OK_AnyValue;
1553 TargetTransformInfo::OperandValueKind Op2VK =
1554 TargetTransformInfo::OK_UniformConstantValue;
1555
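// A vectorized GEP mostly costs its index arithmetic, so model the scalar
// and vector forms (roughly) as integer adds with a constant second operand.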
1556 int ScalarCost =
1557 VecTy->getNumElements() *
1558 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1559 int VecCost =
1560 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1561
1562 return VecCost - ScalarCost;
1563 }
1564 case Instruction::Load: {
1565 // Cost of wide load - cost of scalar loads.
1566 int ScalarLdCost = VecTy->getNumElements() *
1567 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
1568 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
1569 return VecLdCost - ScalarLdCost;
1570 }
1571 case Instruction::Store: {
1572 // We know that we can merge the stores. Calculate the cost.
1573 int ScalarStCost = VecTy->getNumElements() *
1574 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
1575 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
1576 return VecStCost - ScalarStCost;
1577 }
1578 case Instruction::Call: {
1579 CallInst *CI = cast<CallInst>(VL0);
1580 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1581
1582 // Calculate the cost of the scalar and vector calls.
1583 SmallVector<Type*, 4> ScalarTys, VecTys;
1584 for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
1585 ScalarTys.push_back(CI->getArgOperand(op)->getType());
1586 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1587 VecTy->getNumElements()));
1588 }
1589
1590 int ScalarCallCost = VecTy->getNumElements() *
1591 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
1592
1593 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
1594
1595 DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
1596 << " (" << VecCallCost << "-" << ScalarCallCost << ")"
1597 << " for " << *CI << "\n");
1598
1599 return VecCallCost - ScalarCallCost;
1600 }
1601 case Instruction::ShuffleVector: {
1602 TargetTransformInfo::OperandValueKind Op1VK =
1603 TargetTransformInfo::OK_AnyValue;
1604 TargetTransformInfo::OperandValueKind Op2VK =
1605 TargetTransformInfo::OK_AnyValue;
1606 int ScalarCost = 0;
1607 int VecCost = 0;
1608 for (unsigned i = 0; i < VL.size(); ++i) {
1609 Instruction *I = cast<Instruction>(VL[i]);
1610 if (!I)
1611 break;
1612 ScalarCost +=
1613 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1614 }
1615 // VecCost is equal to the sum of the cost of creating 2 vectors
1616 // and the cost of creating the shuffle.
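// SK_Alternate corresponds to a mask like <0, n+1, 2, n+3, ...>, i.e. even
// lanes taken from the first vector and odd lanes from the second.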
1617 Instruction *I0 = cast<Instruction>(VL[0]);
1618 VecCost =
1619 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1620 Instruction *I1 = cast<Instruction>(VL[1]);
1621 VecCost +=
1622 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1623 VecCost +=
1624 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1625 return VecCost - ScalarCost;
1626 }
1627 default:
1628 llvm_unreachable("Unknown instruction");
1629 }
1630 }
1631
1632 bool BoUpSLP::isFullyVectorizableTinyTree() {
1633 DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1634 VectorizableTree.size() << " is fully vectorizable.\n");
1635
1636 // We only handle trees of height 2.
1637 if (VectorizableTree.size() != 2)
1638 return false;
1639
1640 // Handle splat stores.
1641 if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
1642 return true;
1643
1644 // Gathering cost would be too much for tiny trees.
1645 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1646 return false;
1647
1648 return true;
1649 }
1650
1651 int BoUpSLP::getSpillCost() {
1652 // Walk from the bottom of the tree to the top, tracking which values are
1653 // live. When we see a call instruction that is not part of our tree,
1654 // query TTI to see if there is a cost to keeping values live over it
1655 // (for example, if spills and fills are required).
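// E.g. on targets where a call clobbers the vector registers, a tree value
// that is live across an unrelated call may have to be spilled before the
// call and reloaded after it; TTI reports that cost for the set of live
// values.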
1656 unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1657 int Cost = 0;
1658
1659 SmallPtrSet<Instruction*, 4> LiveValues;
1660 Instruction *PrevInst = nullptr;
1661
1662 for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1663 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1664 if (!Inst)
1665 continue;
1666
1667 if (!PrevInst) {
1668 PrevInst = Inst;
1669 continue;
1670 }
1671
1672 DEBUG(
1673 dbgs() << "SLP: #LV: " << LiveValues.size();
1674 for (auto *X : LiveValues)
1675 dbgs() << " " << X->getName();
1676 dbgs() << ", Looking at ";
1677 Inst->dump();
1678 );
1679
1680 // Update LiveValues.
1681 LiveValues.erase(PrevInst);
1682 for (auto &J : PrevInst->operands()) {
1683 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1684 LiveValues.insert(cast<Instruction>(&*J));
1685 }
1686
1687 // Now find the sequence of instructions between PrevInst and Inst.
1688 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
1689 --PrevInstIt;
1690 while (InstIt != PrevInstIt) {
1691 if (PrevInstIt == PrevInst->getParent()->rend()) {
1692 PrevInstIt = Inst->getParent()->rbegin();
1693 continue;
1694 }
1695
1696 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1697 SmallVector<Type*, 4> V;
1698 for (auto *II : LiveValues)
1699 V.push_back(VectorType::get(II->getType(), BundleWidth));
1700 Cost += TTI->getCostOfKeepingLiveOverCall(V);
1701 }
1702
1703 ++PrevInstIt;
1704 }
1705
1706 PrevInst = Inst;
1707 }
1708
1709 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
1710 return Cost;
1711 }
1712
1713 int BoUpSLP::getTreeCost() {
1714 int Cost = 0;
1715 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1716 VectorizableTree.size() << ".\n");
1717
1718 // We only vectorize tiny trees if they are fully vectorizable.
1719 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1720 if (VectorizableTree.empty()) {
1721 assert(!ExternalUses.size() && "We should not have any external users");
1722 }
1723 return INT_MAX;
1724 }
1725
1726 unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1727
1728 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
1729 int C = getEntryCost(&VectorizableTree[i]);
1730 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1731 << *VectorizableTree[i].Scalars[0] << " .\n");
1732 Cost += C;
1733 }
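// On top of the per-bundle costs, account for extracting scalars that have
// users outside the tree and for spills across calls (added below).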
1734
1735 SmallSet<Value *, 16> ExtractCostCalculated;
1736 int ExtractCost = 0;
1737 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
1738 I != E; ++I) {
1739 // We only add the extract cost once for each scalar.
1740 if (!ExtractCostCalculated.insert(I->Scalar).second)
1741 continue;
1742
1743 // Uses by ephemeral values are free (because the ephemeral value will be
1744 // removed prior to code generation, and so the extraction will be
1745 // removed as well).
1746 if (EphValues.count(I->User))
1747 continue;
1748
1749 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
1750 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1751 I->Lane);
1752 }
1753
1754 Cost += getSpillCost();
1755
1756 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost << ".\n");
1757 return Cost + ExtractCost;
1758 }
1759
1760 int BoUpSLP::getGatherCost(Type *Ty) {
1761 int Cost = 0;
1762 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1763 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1764 return Cost;
1765 }
1766
1767 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1768 // Find the type of the operands in VL.
1769 Type *ScalarTy = VL[0]->getType();
1770 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1771 ScalarTy = SI->getValueOperand()->getType();
1772 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1773 // Find the cost of inserting/extracting values from the vector.
1774 return getGatherCost(VecTy);
1775 }
1776
1777 Value *BoUpSLP::getPointerOperand(Value *I) {
1778 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1779 return LI->getPointerOperand();
1780 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1781 return SI->getPointerOperand();
1782 return nullptr;
1783 }
1784
1785 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
1786 if (LoadInst *L = dyn_cast<LoadInst>(I))
1787 return L->getPointerAddressSpace();
1788 if (StoreInst *S = dyn_cast<StoreInst>(I))
1789 return S->getPointerAddressSpace();
1790 return -1;
1791 }
1792
1793 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
1794 Value *PtrA = getPointerOperand(A);
1795 Value *PtrB = getPointerOperand(B);
1796 unsigned ASA = getAddressSpaceOperand(A);
1797 unsigned ASB = getAddressSpaceOperand(B);
1798
1799 // Check that the address spaces match and that the pointers are valid.
1800 if (!PtrA || !PtrB || (ASA != ASB))
1801 return false;
1802
1803 // Make sure that A and B are different pointers of the same type.
1804 if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
1805 return false;
1806
1807 unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
1808 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1809 APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
1810
1811 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
1812 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1813 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1814
1815 APInt OffsetDelta = OffsetB - OffsetA;
1816
1817 // Check if they are based on the same pointer. That makes the offsets
1818 // sufficient.
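// E.g. for i32 accesses with A = &p[1] and B = &p[2], OffsetDelta is 4,
// which equals the i32 store size, so the two accesses are consecutive.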
1819 if (PtrA == PtrB)
1820 return OffsetDelta == Size;
1821
1822 // Compute the necessary base pointer delta to have the necessary final delta
1823 // equal to the size.
1824 APInt BaseDelta = Size - OffsetDelta;
1825
1826 // Otherwise compute the distance with SCEV between the base pointers.
1827 const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
1828 const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
1829 const SCEV *C = SE->getConstant(BaseDelta);
1830 const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
1831 return X == PtrSCEVB;
1832 }
1833
1834 // Reorder commutative operations in an alternate shuffle if the resulting
1835 // vectors are consecutive loads. This would allow us to vectorize the tree.
1836 // If we have something like:
1837 // load a[0] - load b[0]
1838 // load b[1] + load a[1]
1839 // load a[2] - load b[2]
1840 // load a[3] + load b[3]
1841 // Reordering the second lane (load b[1], load a[1]) would allow us to
1842 // vectorize this code.
1843 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
1844 SmallVectorImpl<Value *> &Left,
1845 SmallVectorImpl<Value *> &Right) {
1846 const DataLayout &DL = F->getParent()->getDataLayout();
1847
1848 // Push left and right operands of binary operation into Left and Right
1849 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1850 Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
1851 Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
1852 }
1853
1854 // Reorder if we have a commutative operation and consecutive accesses
1855 // are on either side of the alternate instructions.
1856 for (unsigned j = 0; j < VL.size() - 1; ++j) {
1857 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1858 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1859 Instruction *VL1 = cast<Instruction>(VL[j]);
1860 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1861 if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
1862 std::swap(Left[j], Right[j]);
1863 continue;
1864 } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
1865 std::swap(Left[j + 1], Right[j + 1]);
1866 continue;
1867 }
1868 // else unchanged
1869 }
1870 }
1871 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1872 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1873 Instruction *VL1 = cast<Instruction>(VL[j]);
1874 Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1875 if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
1876 std::swap(Left[j], Right[j]);
1877 continue;
1878 } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
1879 std::swap(Left[j + 1], Right[j + 1]);
1880 continue;
1881 }
1882 // else unchanged
1883 }
1884 }
1885 }
1886 }
1887
1888 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1889 SmallVectorImpl<Value *> &Left,
1890 SmallVectorImpl<Value *> &Right) {
1891
1892 SmallVector<Value *, 16> OrigLeft, OrigRight;
1893
1894 bool AllSameOpcodeLeft = true;
1895 bool AllSameOpcodeRight = true;
1896 for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1897 Instruction *I = cast<Instruction>(VL[i]);
1898 Value *VLeft = I->getOperand(0);
1899 Value *VRight = I->getOperand(1);
1900
1901 OrigLeft.push_back(VLeft);
1902 OrigRight.push_back(VRight);
1903
1904 Instruction *ILeft = dyn_cast<Instruction>(VLeft);
1905 Instruction *IRight = dyn_cast<Instruction>(VRight);
1906
1907 // Check whether all operands on one side have the same opcode. In this case
1908 // we want to preserve the original order and not make things worse by
1909 // reordering.
1910 if (i && AllSameOpcodeLeft && ILeft) {
1911 if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
1912 if (PLeft->getOpcode() != ILeft->getOpcode())
1913 AllSameOpcodeLeft = false;
1914 } else
1915 AllSameOpcodeLeft = false;
1916 }
1917 if (i && AllSameOpcodeRight && IRight) {
1918 if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
1919 if (PRight->getOpcode() != IRight->getOpcode())
1920 AllSameOpcodeRight = false;
1921 } else
1922 AllSameOpcodeRight = false;
1923 }
1924
1925 // Sort two opcodes. In the code below we try to preserve the ability to use
1926 // a broadcast of values instead of individual inserts.
1927 // vl1 = load
1928 // vl2 = phi
1929 // vr1 = load
1930 // vr2 = vr1
1931 // = vl1 x vr1
1932 // = vl2 x vr2
1933 // If we just sorted according to opcode we would leave the first line
1934 // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
1935 // = vl1 x vr1
1936 // = vr2 x vl2
1937 // Because vr2 and vr1 are from the same load we lose the opportunity of a
1938 // broadcast for the packed right side in the backend: we have [vr1, vl2]
1939 // instead of [vr1, vr2=vr1].
1940 if (ILeft && IRight) {
1941 if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
1942 Left.push_back(IRight);
1943 Right.push_back(ILeft);
1944 } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
1945 Right[i - 1] != IRight) {
1946 // Try not to destroy a broadcast for no apparent benefit.
1947 Left.push_back(IRight);
1948 Right.push_back(ILeft);
1949 } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
1950 Right[i - 1] == ILeft) {
1951 // Try to preserve broadcasts.
1952 Left.push_back(IRight);
1953 Right.push_back(ILeft);
1954 } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
1955 Left[i - 1] == IRight) {
1956 // Try to preserve broadcasts.
1957 Left.push_back(IRight);
1958 Right.push_back(ILeft);
1959 } else {
1960 Left.push_back(ILeft);
1961 Right.push_back(IRight);
1962 }
1963 continue;
1964 }
1965 // Only one operand is an instruction; put it on the right.
1966 if (ILeft) {
1967 Left.push_back(VRight);
1968 Right.push_back(ILeft);
1969 continue;
1970 }
1971 Left.push_back(VLeft);
1972 Right.push_back(VRight);
1973 }
1974
1975 bool LeftBroadcast = isSplat(Left);
1976 bool RightBroadcast = isSplat(Right);
1977
1978 // If the operands end up forming a broadcast, return this operand order.
1979 if (LeftBroadcast || RightBroadcast)
1980 return;
1981
1982 // Don't reorder if the operands were good to begin with.
1983 if (AllSameOpcodeRight || AllSameOpcodeLeft) {
1984 Left = OrigLeft;
1985 Right = OrigRight;
1986 }
1987
1988 const DataLayout &DL = F->getParent()->getDataLayout();
1989
1990 // Finally check if we can get a longer vectorizable chain by reordering
1991 // without breaking the good operand order detected above.
1992 // E.g., if we have something like:
1993 // load a[0] load b[0]
1994 // load b[1] load a[1]
1995 // load a[2] load b[2]
1996 // load a[3] load b[3]
1997 // Reordering the second lane (load b[1], load a[1]) would allow us to
1998 // vectorize this code and still retain the AllSameOpcode property.
1999 // FIXME: This load reordering might break AllSameOpcode in some rare cases
2000 // such as:
2001 // add a[0],c[0] load b[0]
2002 // add a[1],c[2] load b[1]
2003 // b[2] load b[2]
2004 // add a[3],c[3] load b[3]
2005 for (unsigned j = 0; j < VL.size() - 1; ++j) {
2006 if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2007 if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2008 if (isConsecutiveAccess(L, L1, DL)) {
2009 std::swap(Left[j + 1], Right[j + 1]);
2010 continue;
2011 }
2012 }
2013 }
2014 if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2015 if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2016 if (isConsecutiveAccess(L, L1, DL)) {
2017 std::swap(Left[j + 1], Right[j + 1]);
2018 continue;
2019 }
2020 }
2021 }
2022 // else unchanged
2023 }
2024 }
2025
2026 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2027 Instruction *VL0 = cast<Instruction>(VL[0]);
2028 BasicBlock::iterator NextInst = VL0;
2029 ++NextInst;
2030 Builder.SetInsertPoint(VL0->getParent(), NextInst);
2031 Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
2032 }
2033
2034 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2035 Value *Vec = UndefValue::get(Ty);
2036 // Generate the 'InsertElement' instruction.
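// For a 4-wide gather this builds a chain roughly like:
//   %v0 = insertelement <4 x T> undef, T %s0, i32 0
//   %v1 = insertelement <4 x T> %v0, T %s1, i32 1
//   ...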
2037 for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2038 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2039 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2040 GatherSeq.insert(Insrt);
2041 CSEBlocks.insert(Insrt->getParent());
2042
2043 // Add to our 'need-to-extract' list.
2044 if (ScalarToTreeEntry.count(VL[i])) {
2045 int Idx = ScalarToTreeEntry[VL[i]];
2046 TreeEntry *E = &VectorizableTree[Idx];
2047 // Find which lane we need to extract.
2048 int FoundLane = -1;
2049 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2050 // Is this the lane of the scalar that we are looking for?
2051 if (E->Scalars[Lane] == VL[i]) {
2052 FoundLane = Lane;
2053 break;
2054 }
2055 }
2056 assert(FoundLane >= 0 && "Could not find the correct lane");
2057 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2058 }
2059 }
2060 }
2061
2062 return Vec;
2063 }
2064
2065 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
2066 SmallDenseMap<Value*, int>::const_iterator Entry
2067 = ScalarToTreeEntry.find(VL[0]);
2068 if (Entry != ScalarToTreeEntry.end()) {
2069 int Idx = Entry->second;
2070 const TreeEntry *En = &VectorizableTree[Idx];
2071 if (En->isSame(VL) && En->VectorizedValue)
2072 return En->VectorizedValue;
2073 }
2074 return nullptr;
2075 }
2076
2077 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2078 if (ScalarToTreeEntry.count(VL[0])) {
2079 int Idx = ScalarToTreeEntry[VL[0]];
2080 TreeEntry *E = &VectorizableTree[Idx];
2081 if (E->isSame(VL))
2082 return vectorizeTree(E);
2083 }
2084
2085 Type *ScalarTy = VL[0]->getType();
2086 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2087 ScalarTy = SI->getValueOperand()->getType();
2088 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2089
2090 return Gather(VL, VecTy);
2091 }
2092
2093 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2094 IRBuilder<>::InsertPointGuard Guard(Builder);
2095
2096 if (E->VectorizedValue) {
2097 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2098 return E->VectorizedValue;
2099 }
2100
2101 Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2102 Type *ScalarTy = VL0->getType();
2103 if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2104 ScalarTy = SI->getValueOperand()->getType();
2105 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2106
2107 if (E->NeedToGather) {
2108 setInsertPointAfterBundle(E->Scalars);
2109 return Gather(E->Scalars, VecTy);
2110 }
2111
2112 const DataLayout &DL = F->getParent()->getDataLayout();
2113 unsigned Opcode = getSameOpcode(E->Scalars);
2114
2115 switch (Opcode) {
2116 case Instruction::PHI: {
2117 PHINode *PH = dyn_cast<PHINode>(VL0);
2118 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2119 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2120 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2121 E->VectorizedValue = NewPhi;
2122
2123 // PHINodes may have multiple entries from the same block. We want to
2124 // visit every block once.
2125 SmallSet<BasicBlock*, 4> VisitedBBs;
2126
2127 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2128 ValueList Operands;
2129 BasicBlock *IBB = PH->getIncomingBlock(i);
2130
2131 if (!VisitedBBs.insert(IBB).second) {
2132 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2133 continue;
2134 }
2135
2136 // Prepare the operand vector.
2137 for (unsigned j = 0; j < E->Scalars.size(); ++j)
2138 Operands.push_back(cast<PHINode>(E->Scalars[j])->
2139 getIncomingValueForBlock(IBB));
2140
2141 Builder.SetInsertPoint(IBB->getTerminator());
2142 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2143 Value *Vec = vectorizeTree(Operands);
2144 NewPhi->addIncoming(Vec, IBB);
2145 }
2146
2147 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2148 "Invalid number of incoming values");
2149 return NewPhi;
2150 }
2151
2152 case Instruction::ExtractElement: {
2153 if (CanReuseExtract(E->Scalars)) {
2154 Value *V = VL0->getOperand(0);
2155 E->VectorizedValue = V;
2156 return V;
2157 }
2158 return Gather(E->Scalars, VecTy);
2159 }
2160 case Instruction::ZExt:
2161 case Instruction::SExt:
2162 case Instruction::FPToUI:
2163 case Instruction::FPToSI:
2164 case Instruction::FPExt:
2165 case Instruction::PtrToInt:
2166 case Instruction::IntToPtr:
2167 case Instruction::SIToFP:
2168 case Instruction::UIToFP:
2169 case Instruction::Trunc:
2170 case Instruction::FPTrunc:
2171 case Instruction::BitCast: {
2172 ValueList INVL;
2173 for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2174 INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2175
2176 setInsertPointAfterBundle(E->Scalars);
2177
2178 Value *InVec = vectorizeTree(INVL);
2179
2180 if (Value *V = alreadyVectorized(E->Scalars))
2181 return V;
2182
2183 CastInst *CI = dyn_cast<CastInst>(VL0);
2184 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2185 E->VectorizedValue = V;
2186 ++NumVectorInstructions;
2187 return V;
2188 }
2189 case Instruction::FCmp:
2190 case Instruction::ICmp: {
2191 ValueList LHSV, RHSV;
2192 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2193 LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2194 RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2195 }
2196
2197 setInsertPointAfterBundle(E->Scalars);
2198
2199 Value *L = vectorizeTree(LHSV);
2200 Value *R = vectorizeTree(RHSV);
2201
2202 if (Value *V = alreadyVectorized(E->Scalars))
2203 return V;
2204
2205 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2206 Value *V;
2207 if (Opcode == Instruction::FCmp)
2208 V = Builder.CreateFCmp(P0, L, R);
2209 else
2210 V = Builder.CreateICmp(P0, L, R);
2211
2212 E->VectorizedValue = V;
2213 ++NumVectorInstructions;
2214 return V;
2215 }
2216 case Instruction::Select: {
2217 ValueList TrueVec, FalseVec, CondVec;
2218 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2219 CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2220 TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2221 FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
2222 }
2223
2224 setInsertPointAfterBundle(E->Scalars);
2225
2226 Value *Cond = vectorizeTree(CondVec);
2227 Value *True = vectorizeTree(TrueVec);
2228 Value *False = vectorizeTree(FalseVec);
2229
2230 if (Value *V = alreadyVectorized(E->Scalars))
2231 return V;
2232
2233 Value *V = Builder.CreateSelect(Cond, True, False);
2234 E->VectorizedValue = V;
2235 ++NumVectorInstructions;
2236 return V;
2237 }
2238 case Instruction::Add:
2239 case Instruction::FAdd:
2240 case Instruction::Sub:
2241 case Instruction::FSub:
2242 case Instruction::Mul:
2243 case Instruction::FMul:
2244 case Instruction::UDiv:
2245 case Instruction::SDiv:
2246 case Instruction::FDiv:
2247 case Instruction::URem:
2248 case Instruction::SRem:
2249 case Instruction::FRem:
2250 case Instruction::Shl:
2251 case Instruction::LShr:
2252 case Instruction::AShr:
2253 case Instruction::And:
2254 case Instruction::Or:
2255 case Instruction::Xor: {
2256 ValueList LHSVL, RHSVL;
2257 if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2258 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2259 else
2260 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2261 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2262 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2263 }
2264
2265 setInsertPointAfterBundle(E->Scalars);
2266
2267 Value *LHS = vectorizeTree(LHSVL);
2268 Value *RHS = vectorizeTree(RHSVL);
2269
2270 if (LHS == RHS && isa<Instruction>(LHS)) {
2271 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2272 }
2273
2274 if (Value *V = alreadyVectorized(E->Scalars))
2275 return V;
2276
2277 BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2278 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2279 E->VectorizedValue = V;
2280 propagateIRFlags(E->VectorizedValue, E->Scalars);
2281 ++NumVectorInstructions;
2282
2283 if (Instruction *I = dyn_cast<Instruction>(V))
2284 return propagateMetadata(I, E->Scalars);
2285
2286 return V;
2287 }
2288 case Instruction::Load: {
2289 // Loads are inserted at the head of the tree because we don't want to
2290 // sink them all the way down past store instructions.
2291 setInsertPointAfterBundle(E->Scalars);
2292
2293 LoadInst *LI = cast<LoadInst>(VL0);
2294 Type *ScalarLoadTy = LI->getType();
2295 unsigned AS = LI->getPointerAddressSpace();
2296
2297 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2298 VecTy->getPointerTo(AS));
2299
2300 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2301 // ExternalUses list to make sure that an extract will be generated in the
2302 // future.
2303 if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2304 ExternalUses.push_back(
2305 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2306
2307 unsigned Alignment = LI->getAlignment();
2308 LI = Builder.CreateLoad(VecPtr);
2309 if (!Alignment) {
2310 Alignment = DL.getABITypeAlignment(ScalarLoadTy);
2311 }
2312 LI->setAlignment(Alignment);
2313 E->VectorizedValue = LI;
2314 ++NumVectorInstructions;
2315 return propagateMetadata(LI, E->Scalars);
2316 }
2317 case Instruction::Store: {
2318 StoreInst *SI = cast<StoreInst>(VL0);
2319 unsigned Alignment = SI->getAlignment();
2320 unsigned AS = SI->getPointerAddressSpace();
2321
2322 ValueList ValueOp;
2323 for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2324 ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
2325
2326 setInsertPointAfterBundle(E->Scalars);
2327
2328 Value *VecValue = vectorizeTree(ValueOp);
2329 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2330 VecTy->getPointerTo(AS));
2331 StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2332
2333 // The pointer operand uses an in-tree scalar so we add the new BitCast to
2334 // ExternalUses list to make sure that an extract will be generated in the
2335 // future.
2336 if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2337 ExternalUses.push_back(
2338 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2339
2340 if (!Alignment) {
2341 Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
2342 }
2343 S->setAlignment(Alignment);
2344 E->VectorizedValue = S;
2345 ++NumVectorInstructions;
2346 return propagateMetadata(S, E->Scalars);
2347 }
2348 case Instruction::GetElementPtr: {
2349 setInsertPointAfterBundle(E->Scalars);
2350
2351 ValueList Op0VL;
2352 for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2353 Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
2354
2355 Value *Op0 = vectorizeTree(Op0VL);
2356
2357 std::vector<Value *> OpVecs;
2358 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2359 ++j) {
2360 ValueList OpVL;
2361 for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2362 OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
2363
2364 Value *OpVec = vectorizeTree(OpVL);
2365 OpVecs.push_back(OpVec);
2366 }
2367
2368 Value *V = Builder.CreateGEP(
2369 cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
2370 E->VectorizedValue = V;
2371 ++NumVectorInstructions;
2372
2373 if (Instruction *I = dyn_cast<Instruction>(V))
2374 return propagateMetadata(I, E->Scalars);
2375
2376 return V;
2377 }
2378 case Instruction::Call: {
2379 CallInst *CI = cast<CallInst>(VL0);
2380 setInsertPointAfterBundle(E->Scalars);
2381 Function *FI;
2382 Intrinsic::ID IID = Intrinsic::not_intrinsic;
2383 Value *ScalarArg = nullptr;
2384 if (CI && (FI = CI->getCalledFunction())) {
2385 IID = (Intrinsic::ID) FI->getIntrinsicID();
2386 }
2387 std::vector<Value *> OpVecs;
2388 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2389 ValueList OpVL;
2390 // ctlz, cttz and powi are special intrinsics whose second argument is
2391 // a scalar. This argument should not be vectorized.
2392 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2393 CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2394 ScalarArg = CEI->getArgOperand(j);
2395 OpVecs.push_back(CEI->getArgOperand(j));
2396 continue;
2397 }
2398 for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2399 CallInst *CEI = cast<CallInst>(E->Scalars[i]);
2400 OpVL.push_back(CEI->getArgOperand(j));
2401 }
2402
2403 Value *OpVec = vectorizeTree(OpVL);
2404 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2405 OpVecs.push_back(OpVec);
2406 }
2407
2408 Module *M = F->getParent();
2409 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2410 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2411 Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2412 Value *V = Builder.CreateCall(CF, OpVecs);
2413
2414 // The scalar argument uses an in-tree scalar so we add the new vectorized
2415 // call to ExternalUses list to make sure that an extract will be
2416 // generated in the future.
2417 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2418 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2419
2420 E->VectorizedValue = V;
2421 ++NumVectorInstructions;
2422 return V;
2423 }
2424 case Instruction::ShuffleVector: {
2425 ValueList LHSVL, RHSVL;
2426 assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
2427 reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
2428 setInsertPointAfterBundle(E->Scalars);
2429
2430 Value *LHS = vectorizeTree(LHSVL);
2431 Value *RHS = vectorizeTree(RHSVL);
2432
2433 if (Value *V = alreadyVectorized(E->Scalars))
2434 return V;
2435
2436 // Create a vector of LHS op1 RHS
2437 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2438 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2439
2440 // Create a vector of LHS op2 RHS
2441 Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2442 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2443 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2444
2445 // Create a shuffle to take alternate operations from the two vectors.
2446 // Also, gather up odd and even scalar ops to propagate IR flags to
2447 // each vector operation.
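// E.g. for 4 scalars the mask is <0, 5, 2, 7>: even lanes come from V0 (op1)
// and odd lanes from V1 (op2).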
2448 ValueList OddScalars, EvenScalars;
2449 unsigned e = E->Scalars.size();
2450 SmallVector<Constant *, 8> Mask(e);
2451 for (unsigned i = 0; i < e; ++i) {
2452 if (i & 1) {
2453 Mask[i] = Builder.getInt32(e + i);
2454 OddScalars.push_back(E->Scalars[i]);
2455 } else {
2456 Mask[i] = Builder.getInt32(i);
2457 EvenScalars.push_back(E->Scalars[i]);
2458 }
2459 }
2460
2461 Value *ShuffleMask = ConstantVector::get(Mask);
2462 propagateIRFlags(V0, EvenScalars);
2463 propagateIRFlags(V1, OddScalars);
2464
2465 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2466 E->VectorizedValue = V;
2467 ++NumVectorInstructions;
2468 if (Instruction *I = dyn_cast<Instruction>(V))
2469 return propagateMetadata(I, E->Scalars);
2470
2471 return V;
2472 }
2473 default:
2474 llvm_unreachable("unknown inst");
2475 }
2476 return nullptr;
2477 }
2478
2479 Value *BoUpSLP::vectorizeTree() {
2480
2481 // All blocks must be scheduled before any instructions are inserted.
2482 for (auto &BSIter : BlocksSchedules) {
2483 scheduleBlock(BSIter.second.get());
2484 }
2485
2486 Builder.SetInsertPoint(F->getEntryBlock().begin());
2487 vectorizeTree(&VectorizableTree[0]);
2488
2489 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2490
2491 // Extract all of the elements with the external uses.
2492 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2493 it != e; ++it) {
2494 Value *Scalar = it->Scalar;
2495 llvm::User *User = it->User;
2496
2497 // Skip users that we already RAUW'd. This happens when one instruction
2498 // has multiple uses of the same value.
2499 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2500 Scalar->user_end())
2501 continue;
2502 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2503
2504 int Idx = ScalarToTreeEntry[Scalar];
2505 TreeEntry *E = &VectorizableTree[Idx];
2506 assert(!E->NeedToGather && "Extracting from a gather list");
2507
2508 Value *Vec = E->VectorizedValue;
2509 assert(Vec && "Can't find vectorizable value");
2510
2511 Value *Lane = Builder.getInt32(it->Lane);
2512 // Generate extracts for out-of-tree users.
2513 // Find the insertion point for the extractelement lane.
2514 if (isa<Instruction>(Vec)){
2515 if (PHINode *PH = dyn_cast<PHINode>(User)) {
2516 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2517 if (PH->getIncomingValue(i) == Scalar) {
2518 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2519 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2520 CSEBlocks.insert(PH->getIncomingBlock(i));
2521 PH->setOperand(i, Ex);
2522 }
2523 }
2524 } else {
2525 Builder.SetInsertPoint(cast<Instruction>(User));
2526 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2527 CSEBlocks.insert(cast<Instruction>(User)->getParent());
2528 User->replaceUsesOfWith(Scalar, Ex);
2529 }
2530 } else {
2531 Builder.SetInsertPoint(F->getEntryBlock().begin());
2532 Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2533 CSEBlocks.insert(&F->getEntryBlock());
2534 User->replaceUsesOfWith(Scalar, Ex);
2535 }
2536
2537 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2538 }
2539
2540 // For each vectorized value:
2541 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2542 TreeEntry *Entry = &VectorizableTree[EIdx];
2543
2544 // For each lane:
2545 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2546 Value *Scalar = Entry->Scalars[Lane];
2547 // No need to handle users of gathered values.
2548 if (Entry->NeedToGather)
2549 continue;
2550
2551 assert(Entry->VectorizedValue && "Can't find vectorizable value");
2552
2553 Type *Ty = Scalar->getType();
2554 if (!Ty->isVoidTy()) {
2555 #ifndef NDEBUG
2556 for (User *U : Scalar->users()) {
2557 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2558
2559 assert((ScalarToTreeEntry.count(U) ||
2560 // It is legal to replace users in the ignorelist by undef.
2561 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2562 UserIgnoreList.end())) &&
2563 "Replacing out-of-tree value with undef");
2564 }
2565 #endif
2566 Value *Undef = UndefValue::get(Ty);
2567 Scalar->replaceAllUsesWith(Undef);
2568 }
2569 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2570 eraseInstruction(cast<Instruction>(Scalar));
2571 }
2572 }
2573
2574 Builder.ClearInsertionPoint();
2575
2576 return VectorizableTree[0].VectorizedValue;
2577 }
2578
2579 void BoUpSLP::optimizeGatherSequence() {
2580 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2581 << " gather sequence instructions.\n");
2582 // LICM InsertElementInst sequences.
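// If neither the vector being built nor the inserted element is defined
// inside the loop, the insertelement can be hoisted to the preheader so the
// gather is materialized only once.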
2583 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2584 e = GatherSeq.end(); it != e; ++it) {
2585 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2586
2587 if (!Insert)
2588 continue;
2589
2590 // Check if this block is inside a loop.
2591 Loop *L = LI->getLoopFor(Insert->getParent());
2592 if (!L)
2593 continue;
2594
2595 // Check if it has a preheader.
2596 BasicBlock *PreHeader = L->getLoopPreheader();
2597 if (!PreHeader)
2598 continue;
2599
2600 // If the vector we insert into or the element that we insert is an
2601 // instruction defined inside this loop, then we can't hoist the
2602 // insertelement out of it.
2603 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2604 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2605 if (CurrVec && L->contains(CurrVec))
2606 continue;
2607 if (NewElem && L->contains(NewElem))
2608 continue;
2609
2610 // We can hoist this instruction. Move it to the pre-header.
2611 Insert->moveBefore(PreHeader->getTerminator());
2612 }
2613
2614 // Make a list of all reachable blocks in our CSE queue.
2615 SmallVector<const DomTreeNode *, 8> CSEWorkList;
2616 CSEWorkList.reserve(CSEBlocks.size());
2617 for (BasicBlock *BB : CSEBlocks)
2618 if (DomTreeNode *N = DT->getNode(BB)) {
2619 assert(DT->isReachableFromEntry(N));
2620 CSEWorkList.push_back(N);
2621 }
2622
2623 // Sort blocks by domination. This ensures we visit a block after all blocks
2624 // dominating it are visited.
2625 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2626 [this](const DomTreeNode *A, const DomTreeNode *B) {
2627 return DT->properlyDominates(A, B);
2628 });
2629
2630 // Perform O(N^2) search over the gather sequences and merge identical
2631 // instructions. TODO: We can further optimize this scan if we split the
2632 // instructions into different buckets based on the insert lane.
2633 SmallVector<Instruction *, 16> Visited;
2634 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2635 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2636 "Worklist not sorted properly!");
2637 BasicBlock *BB = (*I)->getBlock();
2638 // For all instructions in blocks containing gather sequences:
2639 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2640 Instruction *In = it++;
2641 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2642 continue;
2643
2644 // Check if we can replace this instruction with any of the
2645 // visited instructions.
2646 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2647 ve = Visited.end();
2648 v != ve; ++v) {
2649 if (In->isIdenticalTo(*v) &&
2650 DT->dominates((*v)->getParent(), In->getParent())) {
2651 In->replaceAllUsesWith(*v);
2652 eraseInstruction(In);
2653 In = nullptr;
2654 break;
2655 }
2656 }
2657 if (In) {
2658 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2659 Visited.push_back(In);
2660 }
2661 }
2662 }
2663 CSEBlocks.clear();
2664 GatherSeq.clear();
2665 }
2666
2667 // Groups the instructions into a bundle (which is then a single scheduling entity)
2668 // and schedules instructions until the bundle gets ready.
2669 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2670 BoUpSLP *SLP) {
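// PHI nodes are never part of the scheduling region; they stay at the top of
// their block, so a PHI bundle is trivially schedulable.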
2671 if (isa<PHINode>(VL[0]))
2672 return true;
2673
2674 // Initialize the instruction bundle.
2675 Instruction *OldScheduleEnd = ScheduleEnd;
2676 ScheduleData *PrevInBundle = nullptr;
2677 ScheduleData *Bundle = nullptr;
2678 bool ReSchedule = false;
2679 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
2680 for (Value *V : VL) {
2681 extendSchedulingRegion(V);
2682 ScheduleData *BundleMember = getScheduleData(V);
2683 assert(BundleMember &&
2684 "no ScheduleData for bundle member (maybe not in same basic block)");
2685 if (BundleMember->IsScheduled) {
2686 // A bundle member was scheduled as single instruction before and now
2687 // needs to be scheduled as part of the bundle. We just get rid of the
2688 // existing schedule.
2689 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
2690 << " was already scheduled\n");
2691 ReSchedule = true;
2692 }
2693 assert(BundleMember->isSchedulingEntity() &&
2694 "bundle member already part of other bundle");
2695 if (PrevInBundle) {
2696 PrevInBundle->NextInBundle = BundleMember;
2697 } else {
2698 Bundle = BundleMember;
2699 }
2700 BundleMember->UnscheduledDepsInBundle = 0;
2701 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2702
2703 // Group the instructions into a bundle.
2704 BundleMember->FirstInBundle = Bundle;
2705 PrevInBundle = BundleMember;
2706 }
2707 if (ScheduleEnd != OldScheduleEnd) {
2708 // The scheduling region got new instructions at the lower end (or it is a
2709 // new region for the first bundle). This makes it necessary to
2710 // recalculate all dependencies.
2711 // It is seldom that this needs to be done a second time after adding the
2712 // initial bundle to the region.
2713 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2714 ScheduleData *SD = getScheduleData(I);
2715 SD->clearDependencies();
2716 }
2717 ReSchedule = true;
2718 }
2719 if (ReSchedule) {
2720 resetSchedule();
2721 initialFillReadyList(ReadyInsts);
2722 }
2723
2724 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2725 << BB->getName() << "\n");
2726
2727 calculateDependencies(Bundle, true, SLP);
2728
2729 // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2730 // means that there are no cyclic dependencies and we can schedule it.
2731 // Note that it's important that we don't "schedule" the bundle yet (see
2732 // cancelScheduling).
2733 while (!Bundle->isReady() && !ReadyInsts.empty()) {
2734
2735 ScheduleData *pickedSD = ReadyInsts.back();
2736 ReadyInsts.pop_back();
2737
2738 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2739 schedule(pickedSD, ReadyInsts);
2740 }
2741 }
2742 return Bundle->isReady();
2743 }
2744
2745 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2746 if (isa<PHINode>(VL[0]))
2747 return;
2748
2749 ScheduleData *Bundle = getScheduleData(VL[0]);
2750 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
2751 assert(!Bundle->IsScheduled &&
2752 "Can't cancel bundle which is already scheduled");
2753 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2754 "tried to unbundle something which is not a bundle");
2755
2756 // Un-bundle: make single instructions out of the bundle.
2757 ScheduleData *BundleMember = Bundle;
2758 while (BundleMember) {
2759 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2760 BundleMember->FirstInBundle = BundleMember;
2761 ScheduleData *Next = BundleMember->NextInBundle;
2762 BundleMember->NextInBundle = nullptr;
2763 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2764 if (BundleMember->UnscheduledDepsInBundle == 0) {
2765 ReadyInsts.insert(BundleMember);
2766 }
2767 BundleMember = Next;
2768 }
2769 }
2770
2771 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2772 if (getScheduleData(V))
2773 return;
2774 Instruction *I = dyn_cast<Instruction>(V);
2775 assert(I && "bundle member must be an instruction");
2776 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2777 if (!ScheduleStart) {
2778 // It's the first instruction in the new region.
2779 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2780 ScheduleStart = I;
2781 ScheduleEnd = I->getNextNode();
2782 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2783 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
2784 return;
2785 }
2786 // Search up and down at the same time, because we don't know if the new
2787 // instruction is above or below the existing scheduling region.
2788 BasicBlock::reverse_iterator UpIter(ScheduleStart);
2789 BasicBlock::reverse_iterator UpperEnd = BB->rend();
2790 BasicBlock::iterator DownIter(ScheduleEnd);
2791 BasicBlock::iterator LowerEnd = BB->end();
2792 for (;;) {
2793 if (UpIter != UpperEnd) {
2794 if (&*UpIter == I) {
2795 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2796 ScheduleStart = I;
2797 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
2798 return;
2799 }
2800 UpIter++;
2801 }
2802 if (DownIter != LowerEnd) {
2803 if (&*DownIter == I) {
2804 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2805 nullptr);
2806 ScheduleEnd = I->getNextNode();
2807 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2808 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
2809 return;
2810 }
2811 DownIter++;
2812 }
2813 assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2814 "instruction not found in block");
2815 }
2816 }
2817
2818 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2819 Instruction *ToI,
2820 ScheduleData *PrevLoadStore,
2821 ScheduleData *NextLoadStore) {
2822 ScheduleData *CurrentLoadStore = PrevLoadStore;
2823 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2824 ScheduleData *SD = ScheduleDataMap[I];
2825 if (!SD) {
2826 // Allocate a new ScheduleData for the instruction.
2827 if (ChunkPos >= ChunkSize) {
2828 ScheduleDataChunks.push_back(
2829 llvm::make_unique<ScheduleData[]>(ChunkSize));
2830 ChunkPos = 0;
2831 }
2832 SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2833 ScheduleDataMap[I] = SD;
2834 SD->Inst = I;
2835 }
2836 assert(!isInSchedulingRegion(SD) &&
2837 "new ScheduleData already in scheduling region");
2838 SD->init(SchedulingRegionID);
2839
2840 if (I->mayReadOrWriteMemory()) {
2841 // Update the linked list of memory accessing instructions.
2842 if (CurrentLoadStore) {
2843 CurrentLoadStore->NextLoadStore = SD;
2844 } else {
2845 FirstLoadStoreInRegion = SD;
2846 }
2847 CurrentLoadStore = SD;
2848 }
2849 }
2850 if (NextLoadStore) {
2851 if (CurrentLoadStore)
2852 CurrentLoadStore->NextLoadStore = NextLoadStore;
2853 } else {
2854 LastLoadStoreInRegion = CurrentLoadStore;
2855 }
2856 }
2857
2858 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
2859 bool InsertInReadyList,
2860 BoUpSLP *SLP) {
2861 assert(SD->isSchedulingEntity());
2862
2863 SmallVector<ScheduleData *, 10> WorkList;
2864 WorkList.push_back(SD);
2865
2866 while (!WorkList.empty()) {
2867 ScheduleData *SD = WorkList.back();
2868 WorkList.pop_back();
2869
2870 ScheduleData *BundleMember = SD;
2871 while (BundleMember) {
2872 assert(isInSchedulingRegion(BundleMember));
2873 if (!BundleMember->hasValidDependencies()) {
2874
2875 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
2876 BundleMember->Dependencies = 0;
2877 BundleMember->resetUnscheduledDeps();
2878
2879 // Handle def-use chain dependencies.
2880 for (User *U : BundleMember->Inst->users()) {
2881 if (isa<Instruction>(U)) {
2882 ScheduleData *UseSD = getScheduleData(U);
2883 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2884 BundleMember->Dependencies++;
2885 ScheduleData *DestBundle = UseSD->FirstInBundle;
2886 if (!DestBundle->IsScheduled) {
2887 BundleMember->incrementUnscheduledDeps(1);
2888 }
2889 if (!DestBundle->hasValidDependencies()) {
2890 WorkList.push_back(DestBundle);
2891 }
2892 }
2893 } else {
2894 // I'm not sure if this can ever happen. But we need to be safe.
2895 // This lets the instruction/bundle never be scheduled and eventually
2896 // disables vectorization.
2897 BundleMember->Dependencies++;
2898 BundleMember->incrementUnscheduledDeps(1);
2899 }
2900 }
2901
2902 // Handle the memory dependencies.
2903 ScheduleData *DepDest = BundleMember->NextLoadStore;
2904 if (DepDest) {
2905 Instruction *SrcInst = BundleMember->Inst;
2906 AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
2907 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2908 unsigned numAliased = 0;
2909 unsigned DistToSrc = 1;
2910
2911 while (DepDest) {
2912 assert(isInSchedulingRegion(DepDest));
2913
2914 // We have two limits to reduce the complexity:
2915 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
2916 // SLP->isAliased (which is the expensive part in this loop).
2917 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
2918 // the whole loop (even if the loop is fast, it's quadratic).
2919 // It's important for the loop break condition (see below) to
2920 // check this limit even between two read-only instructions.
2921 if (DistToSrc >= MaxMemDepDistance ||
2922 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
2923 (numAliased >= AliasedCheckLimit ||
2924 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
2925
2926 // We increment the counter only if the locations are aliased
2927 // (instead of counting all alias checks). This gives a better
2928 // balance between reduced runtime and accurate dependencies.
2929 numAliased++;
2930
2931 DepDest->MemoryDependencies.push_back(BundleMember);
2932 BundleMember->Dependencies++;
2933 ScheduleData *DestBundle = DepDest->FirstInBundle;
2934 if (!DestBundle->IsScheduled) {
2935 BundleMember->incrementUnscheduledDeps(1);
2936 }
2937 if (!DestBundle->hasValidDependencies()) {
2938 WorkList.push_back(DestBundle);
2939 }
2940 }
2941 DepDest = DepDest->NextLoadStore;
2942
2943 // Example, explaining the loop break condition: Let's assume our
2944 // starting instruction is i0 and MaxMemDepDistance = 3.
2945 //
2946 // +--------v--v--v
2947 // i0,i1,i2,i3,i4,i5,i6,i7,i8
2948 // +--------^--^--^
2949 //
2950 // MaxMemDepDistance let us stop alias-checking at i3 and we add
2951 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
2952 // Previously we already added dependencies from i3 to i6,i7,i8
2953 // (because of MaxMemDepDistance). As we added a dependency from
2954 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
2955 // and we can abort this loop at i6.
2956 if (DistToSrc >= 2 * MaxMemDepDistance)
2957 break;
2958 DistToSrc++;
2959 }
2960 }
2961 }
2962 BundleMember = BundleMember->NextInBundle;
2963 }
2964 if (InsertInReadyList && SD->isReady()) {
2965 ReadyInsts.push_back(SD);
2966 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
2967 }
2968 }
2969 }
2970
2971 void BoUpSLP::BlockScheduling::resetSchedule() {
2972 assert(ScheduleStart &&
2973 "tried to reset schedule on block which has not been scheduled");
2974 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2975 ScheduleData *SD = getScheduleData(I);
2976 assert(isInSchedulingRegion(SD));
2977 SD->IsScheduled = false;
2978 SD->resetUnscheduledDeps();
2979 }
2980 ReadyInsts.clear();
2981 }
2982
2983 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2984
2985 if (!BS->ScheduleStart)
2986 return;
2987
2988 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2989
2990 BS->resetSchedule();
2991
2992 // For the real scheduling we use a more sophisticated ready-list: it is
2993 // sorted by the original instruction location. This lets the final schedule
2994 // be as close as possible to the original instruction order.
2995 struct ScheduleDataCompare {
2996 bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
2997 return SD2->SchedulingPriority < SD1->SchedulingPriority;
2998 }
2999 };
3000 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3001
3002 // Ensure that all dependency data is updated and fill the ready-list with
3003 // initial instructions.
3004 int Idx = 0;
3005 int NumToSchedule = 0;
3006 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3007 I = I->getNextNode()) {
3008 ScheduleData *SD = BS->getScheduleData(I);
3009 assert(
3010 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
3011 "scheduler and vectorizer have different opinion on what is a bundle");
3012 SD->FirstInBundle->SchedulingPriority = Idx++;
3013 if (SD->isSchedulingEntity()) {
3014 BS->calculateDependencies(SD, false, this);
3015 NumToSchedule++;
3016 }
3017 }
3018 BS->initialFillReadyList(ReadyInsts);
3019
3020 Instruction *LastScheduledInst = BS->ScheduleEnd;
3021
3022 // Do the "real" scheduling.
3023 while (!ReadyInsts.empty()) {
3024 ScheduleData *picked = *ReadyInsts.begin();
3025 ReadyInsts.erase(ReadyInsts.begin());
3026
3027 // Move the scheduled instruction(s) to their dedicated places, if not
3028 // there yet.
3029 ScheduleData *BundleMember = picked;
3030 while (BundleMember) {
3031 Instruction *pickedInst = BundleMember->Inst;
3032 if (LastScheduledInst->getNextNode() != pickedInst) {
3033 BS->BB->getInstList().remove(pickedInst);
3034 BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
3035 }
3036 LastScheduledInst = pickedInst;
3037 BundleMember = BundleMember->NextInBundle;
3038 }
3039
3040 BS->schedule(picked, ReadyInsts);
3041 NumToSchedule--;
3042 }
3043 assert(NumToSchedule == 0 && "could not schedule all instructions");
3044
3045 // Avoid duplicate scheduling of the block.
3046 BS->ScheduleStart = nullptr;
3047 }
3048
3049 /// The SLPVectorizer Pass.
3050 struct SLPVectorizer : public FunctionPass {
3051 typedef SmallVector<StoreInst *, 8> StoreList;
3052 typedef MapVector<Value *, StoreList> StoreListMap;
3053
3054 /// Pass identification, replacement for typeid
3055 static char ID;
3056
3057 explicit SLPVectorizer() : FunctionPass(ID) {
3058 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
3059 }
3060
3061 ScalarEvolution *SE;
3062 TargetTransformInfo *TTI;
3063 TargetLibraryInfo *TLI;
3064 AliasAnalysis *AA;
3065 LoopInfo *LI;
3066 DominatorTree *DT;
3067 AssumptionCache *AC;
3068
3069 bool runOnFunction(Function &F) override {
3070 if (skipOptnoneFunction(F))
3071 return false;
3072
3073 SE = &getAnalysis<ScalarEvolution>();
3074 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
3075 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
3076 TLI = TLIP ? &TLIP->getTLI() : nullptr;
3077 AA = &getAnalysis<AliasAnalysis>();
3078 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
3079 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3080 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3081
3082 StoreRefs.clear();
3083 bool Changed = false;
3084
3085 // If the target claims to have no vector registers don't attempt
3086 // vectorization.
3087 if (!TTI->getNumberOfRegisters(true))
3088 return false;
3089
3090 // Don't vectorize when the attribute NoImplicitFloat is used.
3091 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
3092 return false;
3093
3094 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
3095
3096 // Use the bottom up slp vectorizer to construct chains that start with
3097 // store instructions.
3098 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
3099
3100 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
3101 // delete instructions.
3102
3103 // Scan the blocks in the function in post order.
3104 for (auto BB : post_order(&F.getEntryBlock())) {
3105 // Vectorize trees that end at stores.
3106 if (unsigned count = collectStores(BB, R)) {
3107 (void)count;
3108 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
3109 Changed |= vectorizeStoreChains(R);
3110 }
3111
3112 // Vectorize trees that end at reductions.
3113 Changed |= vectorizeChainsInBlock(BB, R);
3114 }
3115
3116 if (Changed) {
3117 R.optimizeGatherSequence();
3118 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3119 DEBUG(verifyFunction(F));
3120 }
3121 return Changed;
3122 }
3123
3124 void getAnalysisUsage(AnalysisUsage &AU) const override {
3125 FunctionPass::getAnalysisUsage(AU);
3126 AU.addRequired<AssumptionCacheTracker>();
3127 AU.addRequired<ScalarEvolution>();
3128 AU.addRequired<AliasAnalysis>();
3129 AU.addRequired<TargetTransformInfoWrapperPass>();
3130 AU.addRequired<LoopInfoWrapperPass>();
3131 AU.addRequired<DominatorTreeWrapperPass>();
3132 AU.addPreserved<LoopInfoWrapperPass>();
3133 AU.addPreserved<DominatorTreeWrapperPass>();
3134 AU.setPreservesCFG();
3135 }
3136
3137 private:
3138
3139   /// \brief Collect memory references and group them according to their base
3140   /// object. We group the stores by their base object to reduce the cost of
3141   /// the quadratic search on the stores. TODO: We can further reduce this cost
3142   /// if we flush the chain creation every time we run into a memory barrier.
3143 unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
3144
3145 /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
3146 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
3147
3148 /// \brief Try to vectorize a list of operands.
3149   /// \param BuildVector A list of users to ignore for the purpose of
3150 /// scheduling and that don't need extracting.
3151 /// \returns true if a value was vectorized.
3152 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3153 ArrayRef<Value *> BuildVector = None,
3154 bool allowReorder = false);
3155
3156   /// \brief Try to vectorize a chain that may start at the operands of \p V.
3157 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
3158
3159 /// \brief Vectorize the stores that were collected in StoreRefs.
3160 bool vectorizeStoreChains(BoUpSLP &R);
3161
3162 /// \brief Scan the basic block and look for patterns that are likely to start
3163 /// a vectorization chain.
3164 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3165
3166 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3167 BoUpSLP &R);
3168
3169 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3170 BoUpSLP &R);
3171 private:
3172 StoreListMap StoreRefs;
3173 };
3174
3175 /// \brief Check that the Values in the given slice of the VL array still
3176 /// exist in the WeakVH array.
3177 /// Vectorization of part of the VL array may cause later values in the VL array
3178 /// to become invalid. We track when this has happened in the WeakVH array.
3179 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
3180 unsigned SliceBegin, unsigned SliceSize) {
3181 VL = VL.slice(SliceBegin, SliceSize);
3182 VH = VH.slice(SliceBegin, SliceSize);
3183 return !std::equal(VL.begin(), VL.end(), VH.begin());
3184 }
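
// Illustrative sketch (not taken from any test case): if Chain holds four
// stores S0..S3 and a previous iteration vectorized and erased S2, the
// corresponding WeakVH entry goes null while VL still holds the stale
// pointer, so std::equal fails and the caller skips that slice:
//
//   VL (tracked values):  [S0, S1, S2,      S3]
//   VH (WeakVH handles):  [S0, S1, nullptr, S3]
//
//   hasValueBeenRAUWed(VL, VH, /*SliceBegin=*/2, /*SliceSize=*/2) == true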
3185
3186 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
3187 int CostThreshold, BoUpSLP &R) {
3188 unsigned ChainLen = Chain.size();
3189 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3190 << "\n");
3191 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
3192 auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
3193 unsigned Sz = DL.getTypeSizeInBits(StoreTy);
3194 unsigned VF = MinVecRegSize / Sz;
3195
3196 if (!isPowerOf2_32(Sz) || VF < 2)
3197 return false;
3198
3199 // Keep track of values that were deleted by vectorizing in the loop below.
3200 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3201
3202 bool Changed = false;
3203 // Look for profitable vectorizable trees at all offsets, starting at zero.
3204 for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3205 if (i + VF > e)
3206 break;
3207
3208 // Check that a previous iteration of this loop did not delete the Value.
3209 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3210 continue;
3211
3212 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3213 << "\n");
3214 ArrayRef<Value *> Operands = Chain.slice(i, VF);
3215
3216 R.buildTree(Operands);
3217
3218 int Cost = R.getTreeCost();
3219
3220 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3221 if (Cost < CostThreshold) {
3222 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3223 R.vectorizeTree();
3224
3225 // Move to the next bundle.
3226 i += VF - 1;
3227 Changed = true;
3228 }
3229 }
3230
3231 return Changed;
3232 }
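
// Worked example of the width computation above (illustration only): with the
// default MinVecRegSize of 128 and a chain of simple i32 stores, Sz is 32 and
// VF = 128 / 32 = 4. A chain of length 10 is then examined as the windows
// [0,4), [1,5), ... ; once the window starting at offset 0 is vectorized,
// 'i' skips ahead by VF - 1 and the search resumes at offset 4.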
3233
3234 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3235 int costThreshold, BoUpSLP &R) {
3236 SetVector<StoreInst *> Heads, Tails;
3237 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
3238
3239 // We may run into multiple chains that merge into a single chain. We mark the
3240 // stores that we vectorized so that we don't visit the same store twice.
3241 BoUpSLP::ValueSet VectorizedStores;
3242 bool Changed = false;
3243
3244 // Do a quadratic search on all of the given stores and find
3245 // all of the pairs of stores that follow each other.
3246 for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3247 for (unsigned j = 0; j < e; ++j) {
3248 if (i == j)
3249 continue;
3250 const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
3251 if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
3252 Tails.insert(Stores[j]);
3253 Heads.insert(Stores[i]);
3254 ConsecutiveChain[Stores[i]] = Stores[j];
3255 }
3256 }
3257 }
3258
3259 // For stores that start but don't end a link in the chain:
3260 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
3261 it != e; ++it) {
3262 if (Tails.count(*it))
3263 continue;
3264
3265 // We found a store instr that starts a chain. Now follow the chain and try
3266 // to vectorize it.
3267 BoUpSLP::ValueList Operands;
3268 StoreInst *I = *it;
3269 // Collect the chain into a list.
3270 while (Tails.count(I) || Heads.count(I)) {
3271 if (VectorizedStores.count(I))
3272 break;
3273 Operands.push_back(I);
3274 // Move to the next value in the chain.
3275 I = ConsecutiveChain[I];
3276 }
3277
3278 bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
3279
3280 // Mark the vectorized stores so that we don't vectorize them again.
3281 if (Vectorized)
3282 VectorizedStores.insert(Operands.begin(), Operands.end());
3283 Changed |= Vectorized;
3284 }
3285
3286 return Changed;
3287 }
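
// Illustrative sketch of the Heads/Tails bookkeeping above (hypothetical IR,
// not from any particular test case):
//
//   %p1 = getelementptr inbounds i32, i32* %p, i64 1
//   %p2 = getelementptr inbounds i32, i32* %p, i64 2
//   store i32 %a, i32* %p    ; S0
//   store i32 %b, i32* %p1   ; S1 (consecutive to S0)
//   store i32 %c, i32* %p2   ; S2 (consecutive to S1)
//
// The quadratic search records S0->S1 and S1->S2 in ConsecutiveChain, puts S0
// and S1 into Heads, and S1 and S2 into Tails. Only S0 is a head that is not
// also a tail, so the chain walk starts there and collects [S0, S1, S2]
// before handing the list to vectorizeStoreChain().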
3288
3289
3290 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
3291 unsigned count = 0;
3292 StoreRefs.clear();
3293 const DataLayout &DL = BB->getModule()->getDataLayout();
3294 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
3295 StoreInst *SI = dyn_cast<StoreInst>(it);
3296 if (!SI)
3297 continue;
3298
3299     // Don't touch volatile or atomic stores.
3300 if (!SI->isSimple())
3301 continue;
3302
3303 // Check that the pointer points to scalars.
3304 Type *Ty = SI->getValueOperand()->getType();
3305 if (!isValidElementType(Ty))
3306 continue;
3307
3308 // Find the base pointer.
3309 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
3310
3311 // Save the store locations.
3312 StoreRefs[Ptr].push_back(SI);
3313 count++;
3314 }
3315 return count;
3316 }
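
// Illustrative example (hypothetical IR): the three stores below share the
// underlying object %arr, so GetUnderlyingObject maps them all to the same
// StoreRefs bucket, while the store through %other lands in its own bucket.
//
//   %q0 = getelementptr inbounds i32, i32* %arr, i64 0
//   %q1 = getelementptr inbounds i32, i32* %arr, i64 1
//   %q2 = getelementptr inbounds i32, i32* %arr, i64 2
//   store i32 %x0, i32* %q0
//   store i32 %x1, i32* %q1
//   store i32 %x2, i32* %q2
//   store i32 %y,  i32* %other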
3317
3318 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3319 if (!A || !B)
3320 return false;
3321 Value *VL[] = { A, B };
3322 return tryToVectorizeList(VL, R, None, true);
3323 }
3324
3325 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3326 ArrayRef<Value *> BuildVector,
3327 bool allowReorder) {
3328 if (VL.size() < 2)
3329 return false;
3330
3331 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3332
3333 // Check that all of the parts are scalar instructions of the same type.
3334 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3335 if (!I0)
3336 return false;
3337
3338 unsigned Opcode0 = I0->getOpcode();
3339 const DataLayout &DL = I0->getModule()->getDataLayout();
3340
3341 Type *Ty0 = I0->getType();
3342 unsigned Sz = DL.getTypeSizeInBits(Ty0);
3343 unsigned VF = MinVecRegSize / Sz;
3344
3345 for (int i = 0, e = VL.size(); i < e; ++i) {
3346 Type *Ty = VL[i]->getType();
3347 if (!isValidElementType(Ty))
3348 return false;
3349 Instruction *Inst = dyn_cast<Instruction>(VL[i]);
3350 if (!Inst || Inst->getOpcode() != Opcode0)
3351 return false;
3352 }
3353
3354 bool Changed = false;
3355
3356 // Keep track of values that were deleted by vectorizing in the loop below.
3357 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3358
3359 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3360 unsigned OpsWidth = 0;
3361
3362 if (i + VF > e)
3363 OpsWidth = e - i;
3364 else
3365 OpsWidth = VF;
3366
3367 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3368 break;
3369
3370 // Check that a previous iteration of this loop did not delete the Value.
3371 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3372 continue;
3373
3374 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3375 << "\n");
3376 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3377
3378 ArrayRef<Value *> BuildVectorSlice;
3379 if (!BuildVector.empty())
3380 BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3381
3382 R.buildTree(Ops, BuildVectorSlice);
3383     // TODO: check if we can allow reordering for cases other than
3384     // tryToVectorizePair().
3385 if (allowReorder && R.shouldReorder()) {
3386 assert(Ops.size() == 2);
3387 assert(BuildVectorSlice.empty());
3388 Value *ReorderedOps[] = { Ops[1], Ops[0] };
3389 R.buildTree(ReorderedOps, None);
3390 }
3391 int Cost = R.getTreeCost();
3392
3393 if (Cost < -SLPCostThreshold) {
3394 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3395 Value *VectorizedRoot = R.vectorizeTree();
3396
3397 // Reconstruct the build vector by extracting the vectorized root. This
3398 // way we handle the case where some elements of the vector are undefined.
3399       // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
3400 if (!BuildVectorSlice.empty()) {
3401 // The insert point is the last build vector instruction. The vectorized
3402 // root will precede it. This guarantees that we get an instruction. The
3403 // vectorized tree could have been constant folded.
3404 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3405 unsigned VecIdx = 0;
3406 for (auto &V : BuildVectorSlice) {
3407 IRBuilder<true, NoFolder> Builder(
3408 ++BasicBlock::iterator(InsertAfter));
3409 InsertElementInst *IE = cast<InsertElementInst>(V);
3410 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3411 VectorizedRoot, Builder.getInt32(VecIdx++)));
3412 IE->setOperand(1, Extract);
3413 IE->removeFromParent();
3414 IE->insertAfter(Extract);
3415 InsertAfter = IE;
3416 }
3417 }
3418 // Move to the next bundle.
3419 i += VF - 1;
3420 Changed = true;
3421 }
3422 }
3423
3424 return Changed;
3425 }
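
// Sketch of the build-vector rewrite performed above (hypothetical IR): for a
// 2-element BuildVectorSlice {%ie0, %ie1} and a vectorized root %vec, the loop
// emits one extractelement per lane and repoints each insertelement's scalar
// operand at it, leaving the remaining lanes undef:
//
//   %x0  = extractelement <4 x i32> %vec, i32 0
//   %ie0 = insertelement <4 x i32> undef, i32 %x0, i32 0
//   %x1  = extractelement <4 x i32> %vec, i32 1
//   %ie1 = insertelement <4 x i32> %ie0, i32 %x1, i32 1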
3426
3427 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3428 if (!V)
3429 return false;
3430
3431 // Try to vectorize V.
3432 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3433 return true;
3434
3435 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3436 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3437 // Try to skip B.
3438 if (B && B->hasOneUse()) {
3439 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3440 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3441 if (tryToVectorizePair(A, B0, R)) {
3442 return true;
3443 }
3444 if (tryToVectorizePair(A, B1, R)) {
3445 return true;
3446 }
3447 }
3448
3449 // Try to skip A.
3450 if (A && A->hasOneUse()) {
3451 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3452 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3453 if (tryToVectorizePair(A0, B, R)) {
3454 return true;
3455 }
3456 if (tryToVectorizePair(A1, B, R)) {
3457 return true;
3458 }
3459 }
3460   return false;
3461 }
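
// Illustrative shapes handled above (hypothetical expressions): for
// V = (a0 * b0) + (a1 * b1), the first attempt pairs the two multiplies
// directly. If instead V = A + (B0 + B1) and pairing A with B fails, the
// "skip B" path tries the pairs {A, B0} and {A, B1}, looking one level
// deeper on the single-use side of the expression; the "skip A" path is the
// mirror image.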
3462
3463 /// \brief Generate a shuffle mask to be used in a reduction tree.
3464 ///
3465 /// \param VecLen The length of the vector to be reduced.
3466 /// \param NumEltsToRdx The number of elements that should be reduced in the
3467 /// vector.
3468 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3469 /// reduction. A pairwise reduction will generate a mask of
3470 /// <0,2,...> or <1,3,..> while a splitting reduction will generate
3471 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3472 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
3473 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3474 bool IsPairwise, bool IsLeft,
3475 IRBuilder<> &Builder) {
3476 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3477
3478 SmallVector<Constant *, 32> ShuffleMask(
3479 VecLen, UndefValue::get(Builder.getInt32Ty()));
3480
3481 if (IsPairwise)
3482 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3483 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3484 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3485 else
3486 // Move the upper half of the vector to the lower half.
3487 for (unsigned i = 0; i != NumEltsToRdx; ++i)
3488 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3489
3490 return ConstantVector::get(ShuffleMask);
3491 }
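
// Concrete masks produced for VecLen = 4 and NumEltsToRdx = 2 (illustration
// of the parameter documentation above):
//   pairwise, IsLeft  : <0, 2, undef, undef>
//   pairwise, !IsLeft : <1, 3, undef, undef>
//   splitting         : <2, 3, undef, undef>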
3492
3493
3494 /// Model horizontal reductions.
3495 ///
3496 /// A horizontal reduction is a tree of reduction operations (currently add and
3497 /// fadd) whose leaves are operations that can be packed into a vector.
3498 /// For example, this tree:
3499 ///
3500 /// mul mul mul mul
3501 /// \ / \ /
3502 /// + +
3503 /// \ /
3504 /// +
3505 /// This tree has "mul" as its reduced values and "+" as its reduction
3506 /// operations. A reduction might be feeding into a store or a binary operation
3507 /// feeding a phi.
3508 /// ...
3509 /// \ /
3510 /// +
3511 /// |
3512 /// phi +=
3513 ///
3514 /// Or:
3515 /// ...
3516 /// \ /
3517 /// +
3518 /// |
3519 /// *p =
3520 ///
3521 class HorizontalReduction {
3522 SmallVector<Value *, 16> ReductionOps;
3523 SmallVector<Value *, 32> ReducedVals;
3524
3525 BinaryOperator *ReductionRoot;
3526 PHINode *ReductionPHI;
3527
3528 /// The opcode of the reduction.
3529 unsigned ReductionOpcode;
3530 /// The opcode of the values we perform a reduction on.
3531 unsigned ReducedValueOpcode;
3532 /// The width of one full horizontal reduction operation.
3533 unsigned ReduxWidth;
3534 /// Should we model this reduction as a pairwise reduction tree or a tree that
3535 /// splits the vector in halves and adds those halves.
3536 bool IsPairwiseReduction;
3537
3538 public:
3539   HorizontalReduction()
3540 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3541 ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3542
3543 /// \brief Try to find a reduction tree.
3544   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
3545 assert((!Phi ||
3546 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3547 "Thi phi needs to use the binary operator");
3548
3549     // We could have an initial reduction that is not an add.
3550 // r *= v1 + v2 + v3 + v4
3551 // In such a case start looking for a tree rooted in the first '+'.
3552 if (Phi) {
3553 if (B->getOperand(0) == Phi) {
3554 Phi = nullptr;
3555 B = dyn_cast<BinaryOperator>(B->getOperand(1));
3556 } else if (B->getOperand(1) == Phi) {
3557 Phi = nullptr;
3558 B = dyn_cast<BinaryOperator>(B->getOperand(0));
3559 }
3560 }
3561
3562 if (!B)
3563 return false;
3564
3565 Type *Ty = B->getType();
3566 if (!isValidElementType(Ty))
3567 return false;
3568
3569 const DataLayout &DL = B->getModule()->getDataLayout();
3570 ReductionOpcode = B->getOpcode();
3571 ReducedValueOpcode = 0;
3572 ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
3573 ReductionRoot = B;
3574 ReductionPHI = Phi;
3575
3576 if (ReduxWidth < 4)
3577 return false;
3578
3579 // We currently only support adds.
3580 if (ReductionOpcode != Instruction::Add &&
3581 ReductionOpcode != Instruction::FAdd)
3582 return false;
3583
3584 // Post order traverse the reduction tree starting at B. We only handle true
3585 // trees containing only binary operators.
3586 SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
3587 Stack.push_back(std::make_pair(B, 0));
3588 while (!Stack.empty()) {
3589 BinaryOperator *TreeN = Stack.back().first;
3590 unsigned EdgeToVist = Stack.back().second++;
3591 bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3592
3593 // Only handle trees in the current basic block.
3594 if (TreeN->getParent() != B->getParent())
3595 return false;
3596
3597 // Each tree node needs to have one user except for the ultimate
3598 // reduction.
3599 if (!TreeN->hasOneUse() && TreeN != B)
3600 return false;
3601
3602       // Postorder visit.
3603 if (EdgeToVist == 2 || IsReducedValue) {
3604 if (IsReducedValue) {
3605 // Make sure that the opcodes of the operations that we are going to
3606 // reduce match.
3607 if (!ReducedValueOpcode)
3608 ReducedValueOpcode = TreeN->getOpcode();
3609 else if (ReducedValueOpcode != TreeN->getOpcode())
3610 return false;
3611 ReducedVals.push_back(TreeN);
3612 } else {
3613 // We need to be able to reassociate the adds.
3614 if (!TreeN->isAssociative())
3615 return false;
3616 ReductionOps.push_back(TreeN);
3617 }
3618 // Retract.
3619 Stack.pop_back();
3620 continue;
3621 }
3622
3623 // Visit left or right.
3624 Value *NextV = TreeN->getOperand(EdgeToVist);
3625 BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
3626 if (Next)
3627 Stack.push_back(std::make_pair(Next, 0));
3628 else if (NextV != Phi)
3629 return false;
3630 }
3631 return true;
3632 }
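
  // Example of a tree this matcher accepts (hypothetical IR, an integer add
  // reduction over four multiplies; with i32 elements ReduxWidth is 4, which
  // meets the minimum width checked above):
  //
  //   %m0 = mul i32 %a0, %b0
  //   %m1 = mul i32 %a1, %b1
  //   %m2 = mul i32 %a2, %b2
  //   %m3 = mul i32 %a3, %b3
  //   %s0 = add i32 %m0, %m1
  //   %s1 = add i32 %m2, %m3
  //   %s2 = add i32 %s0, %s1        ; ReductionRoot
  //
  // Here ReductionOpcode is Add, ReducedValueOpcode is Mul, ReducedVals
  // collects the four multiplies, and ReductionOps collects the adds.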
3633
3634 /// \brief Attempt to vectorize the tree found by
3635 /// matchAssociativeReduction.
3636   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
3637 if (ReducedVals.empty())
3638 return false;
3639
3640 unsigned NumReducedVals = ReducedVals.size();
3641 if (NumReducedVals < ReduxWidth)
3642 return false;
3643
3644 Value *VectorizedTree = nullptr;
3645 IRBuilder<> Builder(ReductionRoot);
3646 FastMathFlags Unsafe;
3647 Unsafe.setUnsafeAlgebra();
3648 Builder.SetFastMathFlags(Unsafe);
3649 unsigned i = 0;
3650
3651 for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
3652 V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
3653
3654 // Estimate cost.
3655 int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
3656 if (Cost >= -SLPCostThreshold)
3657 break;
3658
3659 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
3660 << ". (HorRdx)\n");
3661
3662 // Vectorize a tree.
3663 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
3664 Value *VectorizedRoot = V.vectorizeTree();
3665
3666 // Emit a reduction.
3667 Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
3668 if (VectorizedTree) {
3669 Builder.SetCurrentDebugLocation(Loc);
3670 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3671 ReducedSubTree, "bin.rdx");
3672 } else
3673 VectorizedTree = ReducedSubTree;
3674 }
3675
3676 if (VectorizedTree) {
3677 // Finish the reduction.
3678 for (; i < NumReducedVals; ++i) {
3679 Builder.SetCurrentDebugLocation(
3680 cast<Instruction>(ReducedVals[i])->getDebugLoc());
3681 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3682 ReducedVals[i]);
3683 }
3684 // Update users.
3685 if (ReductionPHI) {
3686 assert(ReductionRoot && "Need a reduction operation");
3687 ReductionRoot->setOperand(0, VectorizedTree);
3688 ReductionRoot->setOperand(1, ReductionPHI);
3689 } else
3690 ReductionRoot->replaceAllUsesWith(VectorizedTree);
3691 }
3692 return VectorizedTree != nullptr;
3693 }
3694
3695 private:
3696
3697   /// \brief Calculate the cost of a reduction.
3698   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
3699 Type *ScalarTy = FirstReducedVal->getType();
3700 Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
3701
3702 int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
3703 int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
3704
3705 IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
3706 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
3707
3708 int ScalarReduxCost =
3709 ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
3710
3711 DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
3712 << " for reduction that starts with " << *FirstReducedVal
3713 << " (It is a "
3714 << (IsPairwiseReduction ? "pairwise" : "splitting")
3715 << " reduction)\n");
3716
3717 return VecReduxCost - ScalarReduxCost;
3718 }
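
  // Sketch of the comparison above (illustrative numbers only): if the target
  // reports PairwiseRdxCost = 5, SplittingRdxCost = 3, and an arithmetic add
  // cost of 1 for the queried type, then for ReduxWidth == 4 the scalar chain
  // costs 4 * 1 = 4, the splitting form wins with VecReduxCost = 3, and the
  // function returns 3 - 4 = -1, i.e. a net saving of one cost unit.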
3719
3720   static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
3721 Value *R, const Twine &Name = "") {
3722 if (Opcode == Instruction::FAdd)
3723 return Builder.CreateFAdd(L, R, Name);
3724 return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
3725 }
3726
3727 /// \brief Emit a horizontal reduction of the vectorized value.
3728   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
3729 assert(VectorizedValue && "Need to have a vectorized tree node");
3730 assert(isPowerOf2_32(ReduxWidth) &&
3731 "We only handle power-of-two reductions for now");
3732
3733 Value *TmpVec = VectorizedValue;
3734 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
3735 if (IsPairwiseReduction) {
3736 Value *LeftMask =
3737 createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
3738 Value *RightMask =
3739 createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
3740
3741 Value *LeftShuf = Builder.CreateShuffleVector(
3742 TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
3743 Value *RightShuf = Builder.CreateShuffleVector(
3744 TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
3745 "rdx.shuf.r");
3746 TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
3747 "bin.rdx");
3748 } else {
3749 Value *UpperHalf =
3750 createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
3751 Value *Shuf = Builder.CreateShuffleVector(
3752 TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
3753 TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
3754 }
3755 }
3756
3757 // The result is in the first element of the vector.
3758 return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
3759 }
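
  // Shape of the emitted splitting reduction for ReduxWidth == 4 (hypothetical
  // IR; a pairwise reduction differs only in its shuffle masks):
  //
  //   %rdx.shuf  = shufflevector <4 x i32> %vec, <4 x i32> undef,
  //                    <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  //   %bin.rdx   = add <4 x i32> %vec, %rdx.shuf
  //   %rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef,
  //                    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  //   %bin.rdx2  = add <4 x i32> %bin.rdx, %rdx.shuf1
  //   %res       = extractelement <4 x i32> %bin.rdx2, i32 0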
3760 };
3761
3762 /// \brief Recognize construction of vectors like
3763 /// %ra = insertelement <4 x float> undef, float %s0, i32 0
3764 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
3765 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
3766 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
3767 ///
3768 /// Returns true if it matches
3769 ///
3770 static bool findBuildVector(InsertElementInst *FirstInsertElem,
3771 SmallVectorImpl<Value *> &BuildVector,
3772 SmallVectorImpl<Value *> &BuildVectorOpds) {
3773 if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
3774 return false;
3775
3776 InsertElementInst *IE = FirstInsertElem;
3777 while (true) {
3778 BuildVector.push_back(IE);
3779 BuildVectorOpds.push_back(IE->getOperand(1));
3780
3781 if (IE->use_empty())
3782 return false;
3783
3784 InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
3785 if (!NextUse)
3786 return true;
3787
3788 // If this isn't the final use, make sure the next insertelement is the only
3789     // use. It's OK if the final constructed vector is used multiple times.
3790 if (!IE->hasOneUse())
3791 return false;
3792
3793 IE = NextUse;
3794 }
3795
3796 return false;
3797 }
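
// For the accepted pattern shown in the comment above, the two output lists
// are (illustration): BuildVector = {%ra, %rb, %rc, %rd} and
// BuildVectorOpds = {%s0, %s1, %s2, %s3}; the latter is what
// vectorizeChainsInBlock() below hands to tryToVectorizeList().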
3798
3799 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
3800 return V->getType() < V2->getType();
3801 }
3802
3803 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
3804 bool Changed = false;
3805 SmallVector<Value *, 4> Incoming;
3806 SmallSet<Value *, 16> VisitedInstrs;
3807
3808 bool HaveVectorizedPhiNodes = true;
3809 while (HaveVectorizedPhiNodes) {
3810 HaveVectorizedPhiNodes = false;
3811
3812 // Collect the incoming values from the PHIs.
3813 Incoming.clear();
3814 for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
3815 ++instr) {
3816 PHINode *P = dyn_cast<PHINode>(instr);
3817 if (!P)
3818 break;
3819
3820 if (!VisitedInstrs.count(P))
3821 Incoming.push_back(P);
3822 }
3823
3824 // Sort by type.
3825 std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
3826
3827     // Try to vectorize elements based on their type.
3828 for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
3829 E = Incoming.end();
3830 IncIt != E;) {
3831
3832 // Look for the next elements with the same type.
3833 SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
3834 while (SameTypeIt != E &&
3835 (*SameTypeIt)->getType() == (*IncIt)->getType()) {
3836 VisitedInstrs.insert(*SameTypeIt);
3837 ++SameTypeIt;
3838 }
3839
3840 // Try to vectorize them.
3841 unsigned NumElts = (SameTypeIt - IncIt);
3842 DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
3843 if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
3844         // Success: start over because instructions might have been changed.
3845 HaveVectorizedPhiNodes = true;
3846 Changed = true;
3847 break;
3848 }
3849
3850 // Start over at the next instruction of a different type (or the end).
3851 IncIt = SameTypeIt;
3852 }
3853 }
3854
3855 VisitedInstrs.clear();
3856
3857 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
3858     // We may go through BB multiple times, so skip the ones we have already checked.
3859 if (!VisitedInstrs.insert(it).second)
3860 continue;
3861
3862 if (isa<DbgInfoIntrinsic>(it))
3863 continue;
3864
3865 // Try to vectorize reductions that use PHINodes.
3866 if (PHINode *P = dyn_cast<PHINode>(it)) {
3867 // Check that the PHI is a reduction PHI.
3868 if (P->getNumIncomingValues() != 2)
3869 return Changed;
3870 Value *Rdx =
3871 (P->getIncomingBlock(0) == BB
3872 ? (P->getIncomingValue(0))
3873 : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
3874 : nullptr));
3875 // Check if this is a Binary Operator.
3876 BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
3877 if (!BI)
3878 continue;
3879
3880 // Try to match and vectorize a horizontal reduction.
3881 HorizontalReduction HorRdx;
3882 if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
3883 HorRdx.tryToReduce(R, TTI)) {
3884 Changed = true;
3885 it = BB->begin();
3886 e = BB->end();
3887 continue;
3888 }
3889
3890 Value *Inst = BI->getOperand(0);
3891 if (Inst == P)
3892 Inst = BI->getOperand(1);
3893
3894 if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
3895 // We would like to start over since some instructions are deleted
3896         // and the iterator may have been invalidated.
3897 Changed = true;
3898 it = BB->begin();
3899 e = BB->end();
3900 continue;
3901 }
3902
3903 continue;
3904 }
3905
3906 // Try to vectorize horizontal reductions feeding into a store.
3907 if (ShouldStartVectorizeHorAtStore)
3908 if (StoreInst *SI = dyn_cast<StoreInst>(it))
3909 if (BinaryOperator *BinOp =
3910 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
3911 HorizontalReduction HorRdx;
3912 if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
3913 HorRdx.tryToReduce(R, TTI)) ||
3914 tryToVectorize(BinOp, R))) {
3915 Changed = true;
3916 it = BB->begin();
3917 e = BB->end();
3918 continue;
3919 }
3920 }
3921
3922 // Try to vectorize horizontal reductions feeding into a return.
3923 if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
3924 if (RI->getNumOperands() != 0)
3925 if (BinaryOperator *BinOp =
3926 dyn_cast<BinaryOperator>(RI->getOperand(0))) {
3927 DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
3928 if (tryToVectorizePair(BinOp->getOperand(0),
3929 BinOp->getOperand(1), R)) {
3930 Changed = true;
3931 it = BB->begin();
3932 e = BB->end();
3933 continue;
3934 }
3935 }
3936
3937 // Try to vectorize trees that start at compare instructions.
3938 if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
3939 if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
3940 Changed = true;
3941 // We would like to start over since some instructions are deleted
3942         // and the iterator may have been invalidated.
3943 it = BB->begin();
3944 e = BB->end();
3945 continue;
3946 }
3947
3948 for (int i = 0; i < 2; ++i) {
3949 if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
3950 if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
3951 Changed = true;
3952 // We would like to start over since some instructions are deleted
3953             // and the iterator may have been invalidated.
3954 it = BB->begin();
3955 e = BB->end();
3956 break;
3957 }
3958 }
3959 }
3960 continue;
3961 }
3962
3963 // Try to vectorize trees that start at insertelement instructions.
3964 if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
3965 SmallVector<Value *, 16> BuildVector;
3966 SmallVector<Value *, 16> BuildVectorOpds;
3967 if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
3968 continue;
3969
3970 // Vectorize starting with the build vector operands ignoring the
3971 // BuildVector instructions for the purpose of scheduling and user
3972 // extraction.
3973 if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
3974 Changed = true;
3975 it = BB->begin();
3976 e = BB->end();
3977 }
3978
3979 continue;
3980 }
3981 }
3982
3983 return Changed;
3984 }
3985
3986 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
3987 bool Changed = false;
3988 // Attempt to sort and vectorize each of the store-groups.
3989 for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
3990 it != e; ++it) {
3991 if (it->second.size() < 2)
3992 continue;
3993
3994 DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
3995 << it->second.size() << ".\n");
3996
3997 // Process the stores in chunks of 16.
3998 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
3999 unsigned Len = std::min<unsigned>(CE - CI, 16);
4000 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
4001 -SLPCostThreshold, R);
4002 }
4003 }
4004 return Changed;
4005 }
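
// Worked example of the chunking above (illustration): a bucket of 40 stores
// is processed as slices of 16, 16, and 8; each slice is handed to
// vectorizeStores() with the cost threshold negated, matching the
// "Cost < -SLPCostThreshold" convention used elsewhere in this file.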
4006
4007 } // end anonymous namespace
4008
4009 char SLPVectorizer::ID = 0;
4010 static const char lv_name[] = "SLP Vectorizer";
4011 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
4012 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
4013 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
4014 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4015 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
4016 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4017 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
4018
4019 namespace llvm {
4020 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
4021 }
4022