1 //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains a pass that performs load / store related peephole
11 // optimizations. This pass should be run after register allocation.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AArch64InstrInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "MCTargetDesc/AArch64AddressingModes.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Target/TargetInstrInfo.h"
30 #include "llvm/Target/TargetMachine.h"
31 #include "llvm/Target/TargetRegisterInfo.h"
32 using namespace llvm;
33
34 #define DEBUG_TYPE "aarch64-ldst-opt"
35
/// AArch64LoadStoreOpt - Post-register-allocation pass to combine
/// load / store instructions to form ldp / stp instructions.
38
// Statistics gathered by this pass (shown with -stats).
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");

// Upper bound on the number of real (non-debug) instructions scanned when
// searching for a pairable instruction or a base-register update.
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
55
56 namespace {
57
/// Describes how a candidate pair of load/store instructions is to be
/// combined into a single pair-wise instruction.
struct LdStPairFlags {
  /// If a matching instruction is found, MergeForward is set to true if the
  /// merge is to remove the first instruction and replace the second with a
  /// pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  /// SExtIdx gives the index of the result of the load pair that must be
  /// extended. The value of SExtIdx assumes that the paired load produces the
  /// value in this order: (I, returned iterator), i.e., -1 means no value has
  /// to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
} LdStPairFlags;
79
/// Post-register-allocation machine-function pass that merges adjacent
/// loads/stores into pair instructions, promotes narrow loads/zero-stores to
/// wider accesses, and folds base-register updates into pre/post-indexed
/// addressing forms.
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  // Target hooks, cached in runOnMachineFunction for use by the helpers.
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  // Check if converting two narrow loads into a single wider load with
  // bitfield extracts could be enabled.
  bool enableNarrowLdMerge(MachineFunction &Fn);

  // Run all the transformations over a single basic block; returns true if
  // anything changed.
  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
143 char AArch64LoadStoreOpt::ID = 0;
144 } // namespace
145
146 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
147 AARCH64_LOAD_STORE_OPT_NAME, false, false)
148
// Return true if \p Opc is one of the "unscaled" (LDUR/STUR-style) load or
// store opcodes handled by this pass. Unscaled instructions take a byte
// offset; their scaled counterparts take an offset in units of the access
// size.
static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURBBi:
  case AArch64::STURHHi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSHWi:
    return true;
  }
}
173
isUnscaledLdSt(MachineInstr * MI)174 static bool isUnscaledLdSt(MachineInstr *MI) {
175 return isUnscaledLdSt(MI->getOpcode());
176 }
177
// Return the bitfield-extract opcode used to split the value produced by a
// widened narrow load \p MI back into its component: UBFM for loads whose
// result is zero-extended, SBFM for loads whose result is sign-extended.
static unsigned getBitExtrOpcode(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode.");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return AArch64::UBFMWri;
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
    return AArch64::SBFMWri;
  }
}
194
// Return true if \p Opc is a narrow (8- or 16-bit) store, scaled or unscaled.
static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}
206
isNarrowStore(MachineInstr * MI)207 static bool isNarrowStore(MachineInstr *MI) {
208 return isNarrowStore(MI->getOpcode());
209 }
210
// Return true if \p Opc is a narrow (8- or 16-bit) load — zero- or
// sign-extending, scaled or unscaled.
static bool isNarrowLoad(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
    return true;
  }
}
226
isNarrowLoad(MachineInstr * MI)227 static bool isNarrowLoad(MachineInstr *MI) {
228 return isNarrowLoad(MI->getOpcode());
229 }
230
// Scaling factor for unscaled load or store, i.e. the size in bytes of the
// memory access performed by MI.
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  // 8-bit accesses.
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  // 16-bit accesses.
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  // 32-bit accesses (W/S registers), including 32-bit pair forms.
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  // 64-bit accesses (X/D registers), including 64-bit pair forms.
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  // 128-bit accesses (Q registers), including 128-bit pair forms.
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}
288
// Map a sign-extending load opcode to its plain (non-extending) equivalent,
// e.g. LDRSWui -> LDRWui; opcodes that are already non-extending are
// returned unchanged. If \p IsValidLdStrOpc is non-null it is set to false
// when \p Opc is not a load/store opcode this pass handles (in which case
// UINT_MAX, which matches no real opcode, is returned).
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  // Already non-extending: return the opcode itself.
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
    return Opc;
  // Sign-extending loads: map to the zero-extending form.
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSBWui:
    return AArch64::LDRBBui;
  case AArch64::LDRSHWui:
    return AArch64::LDRHHui;
  case AArch64::LDURSBWi:
    return AArch64::LDURBBi;
  case AArch64::LDURSHWi:
    return AArch64::LDURHHi;
  }
}
341
// Map a single load/store opcode to the opcode of the merged instruction:
// the LDP/STP pair form for 32-bit and wider accesses, or the next-wider
// single load/store for narrow (8/16-bit) accesses, which are merged into
// one wider access instead of a pair.
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  // Narrow stores widen to the next-larger single store.
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  // Narrow loads widen to the next-larger single load.
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
  case AArch64::LDURSHWi:
    return AArch64::LDURWi;
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
    return AArch64::LDRHHui;
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
    return AArch64::LDURHHi;
  }
}
401
// Map a scaled load/store (or pair) opcode to its pre-indexed (writeback)
// equivalent.
static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}
460
// Map a scaled load/store (or pair) opcode to its post-indexed (writeback)
// equivalent.
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}
519
// Return true if \p MI is already a paired (LDP/STP) load or store.
static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}
538
getLdStRegOp(const MachineInstr * MI,unsigned PairedRegOp=0)539 static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
540 unsigned PairedRegOp = 0) {
541 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
542 unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
543 return MI->getOperand(Idx);
544 }
545
getLdStBaseOp(const MachineInstr * MI)546 static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
547 unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
548 return MI->getOperand(Idx);
549 }
550
getLdStOffsetOp(const MachineInstr * MI)551 static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
552 unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
553 return MI->getOperand(Idx);
554 }
555
556 // Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
concatenateMemOperands(MachineInstr * MI,MachineInstr * Op0,MachineInstr * Op1)557 static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
558 MachineInstr *Op1) {
559 assert(MI->memoperands_empty() && "expected a new machineinstr");
560 size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
561 (Op1->memoperands_end() - Op1->memoperands_begin());
562
563 MachineFunction *MF = MI->getParent()->getParent();
564 MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
565 MachineSDNode::mmo_iterator MemEnd =
566 std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
567 MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
568 MI->setMemRefs(MemBegin, MemEnd);
569 }
570
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  // Compute the resume point for the caller before I/Paired are erased.
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  // If one of the two loads is sign-extending, work with the equivalent
  // non-extending opcode; the extension is materialized separately below.
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  // Unscaled instructions carry byte offsets, so adjacency is a full access
  // size apart; scaled instructions are adjacent at +/-1.
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    // Paired has the lower offset, so it provides Rt.
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  // The merged instruction uses the lower of the two offsets.
  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isNarrowLoad(Opc)) {
    // Two adjacent narrow loads become one load of twice the width plus
    // bitfield extracts that recreate the two original values.
    // Change the scaled offset from small to large type.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // When merging small (< 32 bit) loads for big-endian targets, the order of
    // the component parts gets swapped.
    if (!Subtarget->isLittleEndian())
      std::swap(RtMI, Rt2MI);
    // Construct the new load instruction.
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << " ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << " with instructions:\n ");
    DEBUG((NewMemMI)->print(dbgs()));

    // Bitfield parameters: the low half occupies bits [0, Width) of the
    // wider value, the high half bits [Width, 2*Width).
    int Width = getMemScale(I) == 1 ? 8 : 16;
    int LSBLow = 0;
    int LSBHigh = Width;
    int ImmsLow = LSBLow + Width - 1;
    int ImmsHigh = LSBHigh + Width - 1;
    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    // Emit the extract that redefines the wide destination register last, so
    // the other extract still sees the original wide value.
    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
      // Create the bitfield extract for high bits.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }
    } else {
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }

      // Create the bitfield extract for high bits.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
    }
    DEBUG(dbgs() << " ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << " ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  if (isNarrowStore(Opc)) {
    // Two adjacent narrow zero stores become one store of twice the width.
    // Change the scaled offset from small to large type.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(I))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
    // Copy MachineMemOperands from the original stores.
    concatenateMemOperands(MIB, I, Paired);
  } else {
    // Handle Unscaled: the pair instruction takes a scaled offset.
    if (IsUnscaled)
      OffsetImm /= OffsetStride;
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(RtMI))
              .addOperand(getLdStRegOp(Rt2MI))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
  }

  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << " ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << " with instruction:\n ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << " Extend operand:\n ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
802
803 /// trackRegDefsUses - Remember what registers the specified instruction uses
804 /// and modifies.
trackRegDefsUses(const MachineInstr * MI,BitVector & ModifiedRegs,BitVector & UsedRegs,const TargetRegisterInfo * TRI)805 static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
806 BitVector &UsedRegs,
807 const TargetRegisterInfo *TRI) {
808 for (const MachineOperand &MO : MI->operands()) {
809 if (MO.isRegMask())
810 ModifiedRegs.setBitsNotInMask(MO.getRegMask());
811
812 if (!MO.isReg())
813 continue;
814 unsigned Reg = MO.getReg();
815 if (MO.isDef()) {
816 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
817 ModifiedRegs.set(*AI);
818 } else {
819 assert(MO.isUse() && "Reg operand not a def and not a use?!?");
820 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
821 UsedRegs.set(*AI);
822 }
823 }
824 }
825
// Return true if the (possibly byte-scaled) offset fits in the 7-bit signed
// scaled immediate field of a pair instruction.
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Unscaled instructions carry byte offsets; convert to the "element"
  // offset used by the scaled pair load/store instructions.
  if (IsUnscaled)
    Offset /= OffsetStride;

  return -64 <= Offset && Offset <= 63;
}
834
// Round Num up to the next multiple of PowOf2 (which must be a power of 2).
// Specialized to signed ints to avoid having to do a C-style cast from
// uint64_t to int when using RoundUpToAlignment from
// include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  int Mask = PowOf2 - 1;
  return (Num + Mask) & ~Mask;
}
842
mayAlias(MachineInstr * MIa,MachineInstr * MIb,const AArch64InstrInfo * TII)843 static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
844 const AArch64InstrInfo *TII) {
845 // One of the instructions must modify memory.
846 if (!MIa->mayStore() && !MIb->mayStore())
847 return false;
848
849 // Both instructions must be memory operations.
850 if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
851 return false;
852
853 return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
854 }
855
mayAlias(MachineInstr * MIa,SmallVectorImpl<MachineInstr * > & MemInsns,const AArch64InstrInfo * TII)856 static bool mayAlias(MachineInstr *MIa,
857 SmallVectorImpl<MachineInstr *> &MemInsns,
858 const AArch64InstrInfo *TII) {
859 for (auto &MIb : MemInsns)
860 if (mayAlias(MIa, MIb, TII))
861 return true;
862
863 return false;
864 }
865
866 /// findMatchingInsn - Scan the instructions looking for a load/store that can
867 /// be combined with the current instruction into a load/store pair.
868 MachineBasicBlock::iterator
findMatchingInsn(MachineBasicBlock::iterator I,LdStPairFlags & Flags,unsigned Limit)869 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
870 LdStPairFlags &Flags, unsigned Limit) {
871 MachineBasicBlock::iterator E = I->getParent()->end();
872 MachineBasicBlock::iterator MBBI = I;
873 MachineInstr *FirstMI = I;
874 ++MBBI;
875
876 unsigned Opc = FirstMI->getOpcode();
877 bool MayLoad = FirstMI->mayLoad();
878 bool IsUnscaled = isUnscaledLdSt(FirstMI);
879 unsigned Reg = getLdStRegOp(FirstMI).getReg();
880 unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
881 int Offset = getLdStOffsetOp(FirstMI).getImm();
882 bool IsNarrowStore = isNarrowStore(Opc);
883
884 // For narrow stores, find only the case where the stored value is WZR.
885 if (IsNarrowStore && Reg != AArch64::WZR)
886 return E;
887
888 // Early exit if the first instruction modifies the base register.
889 // e.g., ldr x0, [x0]
890 if (FirstMI->modifiesRegister(BaseReg, TRI))
891 return E;
892
893 // Early exit if the offset if not possible to match. (6 bits of positive
894 // range, plus allow an extra one in case we find a later insn that matches
895 // with Offset-1)
896 int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
897 if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
898 !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
899 return E;
900
901 // Track which registers have been modified and used between the first insn
902 // (inclusive) and the second insn.
903 BitVector ModifiedRegs, UsedRegs;
904 ModifiedRegs.resize(TRI->getNumRegs());
905 UsedRegs.resize(TRI->getNumRegs());
906
907 // Remember any instructions that read/write memory between FirstMI and MI.
908 SmallVector<MachineInstr *, 4> MemInsns;
909
910 for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
911 MachineInstr *MI = MBBI;
912 // Skip DBG_VALUE instructions. Otherwise debug info can affect the
913 // optimization by changing how far we scan.
914 if (MI->isDebugValue())
915 continue;
916
917 // Now that we know this is a real instruction, count it.
918 ++Count;
919
920 bool CanMergeOpc = Opc == MI->getOpcode();
921 Flags.setSExtIdx(-1);
922 if (!CanMergeOpc) {
923 bool IsValidLdStrOpc;
924 unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
925 assert(IsValidLdStrOpc &&
926 "Given Opc should be a Load or Store with an immediate");
927 // Opc will be the first instruction in the pair.
928 Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
929 CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
930 }
931
932 if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
933 assert(MI->mayLoadOrStore() && "Expected memory operation.");
934 // If we've found another instruction with the same opcode, check to see
935 // if the base and offset are compatible with our starting instruction.
936 // These instructions all have scaled immediate operands, so we just
937 // check for +1/-1. Make sure to check the new instruction offset is
938 // actually an immediate and not a symbolic reference destined for
939 // a relocation.
940 //
941 // Pairwise instructions have a 7-bit signed offset field. Single insns
942 // have a 12-bit unsigned offset field. To be a valid combine, the
943 // final offset must be in range.
944 unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
945 int MIOffset = getLdStOffsetOp(MI).getImm();
946 if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
947 (Offset + OffsetStride == MIOffset))) {
948 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
949 // If this is a volatile load/store that otherwise matched, stop looking
950 // as something is going on that we don't have enough information to
951 // safely transform. Similarly, stop if we see a hint to avoid pairs.
952 if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
953 return E;
954 // If the resultant immediate offset of merging these instructions
955 // is out of range for a pairwise instruction, bail and keep looking.
956 bool MIIsUnscaled = isUnscaledLdSt(MI);
957 bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
958 if (!IsNarrowLoad &&
959 !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
960 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
961 MemInsns.push_back(MI);
962 continue;
963 }
964
965 if (IsNarrowLoad || IsNarrowStore) {
966 // If the alignment requirements of the scaled wide load/store
967 // instruction can't express the offset of the scaled narrow
968 // input, bail and keep looking.
969 if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
970 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
971 MemInsns.push_back(MI);
972 continue;
973 }
974 } else {
975 // If the alignment requirements of the paired (scaled) instruction
976 // can't express the offset of the unscaled input, bail and keep
977 // looking.
978 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
979 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
980 MemInsns.push_back(MI);
981 continue;
982 }
983 }
984 // If the destination register of the loads is the same register, bail
985 // and keep looking. A load-pair instruction with both destination
986 // registers the same is UNPREDICTABLE and will result in an exception.
987 // For narrow stores, allow only when the stored value is the same
988 // (i.e., WZR).
989 if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
990 (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
991 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
992 MemInsns.push_back(MI);
993 continue;
994 }
995
996 // If the Rt of the second instruction was not modified or used between
997 // the two instructions and none of the instructions between the second
998 // and first alias with the second, we can combine the second into the
999 // first.
1000 if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
1001 !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
1002 !mayAlias(MI, MemInsns, TII)) {
1003 Flags.setMergeForward(false);
1004 return MBBI;
1005 }
1006
1007 // Likewise, if the Rt of the first instruction is not modified or used
1008 // between the two instructions and none of the instructions between the
1009 // first and the second alias with the first, we can combine the first
1010 // into the second.
1011 if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
1012 !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
1013 !mayAlias(FirstMI, MemInsns, TII)) {
1014 Flags.setMergeForward(true);
1015 return MBBI;
1016 }
1017 // Unable to combine these instructions due to interference in between.
1018 // Keep looking.
1019 }
1020 }
1021
1022 // If the instruction wasn't a matching load or store. Stop searching if we
1023 // encounter a call instruction that might modify memory.
1024 if (MI->isCall())
1025 return E;
1026
1027 // Update modified / uses register lists.
1028 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1029
1030 // Otherwise, if the base register is modified, we have no match, so
1031 // return early.
1032 if (ModifiedRegs[BaseReg])
1033 return E;
1034
1035 // Update list of instructions that read/write memory.
1036 if (MI->mayLoadOrStore())
1037 MemInsns.push_back(MI);
1038 }
1039 return E;
1040 }
1041
1042 MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Update,bool IsPreIdx)1043 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1044 MachineBasicBlock::iterator Update,
1045 bool IsPreIdx) {
1046 assert((Update->getOpcode() == AArch64::ADDXri ||
1047 Update->getOpcode() == AArch64::SUBXri) &&
1048 "Unexpected base register update instruction to merge!");
1049 MachineBasicBlock::iterator NextI = I;
1050 // Return the instruction following the merged instruction, which is
1051 // the instruction following our unmerged load. Unless that's the add/sub
1052 // instruction we're merging, in which case it's the one after that.
1053 if (++NextI == Update)
1054 ++NextI;
1055
1056 int Value = Update->getOperand(2).getImm();
1057 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1058 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1059 if (Update->getOpcode() == AArch64::SUBXri)
1060 Value = -Value;
1061
1062 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
1063 : getPostIndexedOpcode(I->getOpcode());
1064 MachineInstrBuilder MIB;
1065 if (!isPairedLdSt(I)) {
1066 // Non-paired instruction.
1067 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1068 .addOperand(getLdStRegOp(Update))
1069 .addOperand(getLdStRegOp(I))
1070 .addOperand(getLdStBaseOp(I))
1071 .addImm(Value);
1072 } else {
1073 // Paired instruction.
1074 int Scale = getMemScale(I);
1075 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
1076 .addOperand(getLdStRegOp(Update))
1077 .addOperand(getLdStRegOp(I, 0))
1078 .addOperand(getLdStRegOp(I, 1))
1079 .addOperand(getLdStBaseOp(I))
1080 .addImm(Value / Scale);
1081 }
1082 (void)MIB;
1083
1084 if (IsPreIdx)
1085 DEBUG(dbgs() << "Creating pre-indexed load/store.");
1086 else
1087 DEBUG(dbgs() << "Creating post-indexed load/store.");
1088 DEBUG(dbgs() << " Replacing instructions:\n ");
1089 DEBUG(I->print(dbgs()));
1090 DEBUG(dbgs() << " ");
1091 DEBUG(Update->print(dbgs()));
1092 DEBUG(dbgs() << " with instruction:\n ");
1093 DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1094 DEBUG(dbgs() << "\n");
1095
1096 // Erase the old instructions for the block.
1097 I->eraseFromParent();
1098 Update->eraseFromParent();
1099
1100 return NextI;
1101 }
1102
isMatchingUpdateInsn(MachineInstr * MemMI,MachineInstr * MI,unsigned BaseReg,int Offset)1103 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
1104 MachineInstr *MI,
1105 unsigned BaseReg, int Offset) {
1106 switch (MI->getOpcode()) {
1107 default:
1108 break;
1109 case AArch64::SUBXri:
1110 // Negate the offset for a SUB instruction.
1111 Offset *= -1;
1112 // FALLTHROUGH
1113 case AArch64::ADDXri:
1114 // Make sure it's a vanilla immediate operand, not a relocation or
1115 // anything else we can't handle.
1116 if (!MI->getOperand(2).isImm())
1117 break;
1118 // Watch out for 1 << 12 shifted value.
1119 if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
1120 break;
1121
1122 // The update instruction source and destination register must be the
1123 // same as the load/store base register.
1124 if (MI->getOperand(0).getReg() != BaseReg ||
1125 MI->getOperand(1).getReg() != BaseReg)
1126 break;
1127
1128 bool IsPairedInsn = isPairedLdSt(MemMI);
1129 int UpdateOffset = MI->getOperand(2).getImm();
1130 // For non-paired load/store instructions, the immediate must fit in a
1131 // signed 9-bit integer.
1132 if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
1133 break;
1134
1135 // For paired load/store instructions, the immediate must be a multiple of
1136 // the scaling factor. The scaled offset must also fit into a signed 7-bit
1137 // integer.
1138 if (IsPairedInsn) {
1139 int Scale = getMemScale(MemMI);
1140 if (UpdateOffset % Scale != 0)
1141 break;
1142
1143 int ScaledOffset = UpdateOffset / Scale;
1144 if (ScaledOffset > 64 || ScaledOffset < -64)
1145 break;
1146 }
1147
1148 // If we have a non-zero Offset, we check that it matches the amount
1149 // we're adding to the register.
1150 if (!Offset || Offset == MI->getOperand(2).getImm())
1151 return true;
1152 break;
1153 }
1154 return false;
1155 }
1156
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,unsigned Limit,int UnscaledOffset)1157 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
1158 MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
1159 MachineBasicBlock::iterator E = I->getParent()->end();
1160 MachineInstr *MemMI = I;
1161 MachineBasicBlock::iterator MBBI = I;
1162
1163 unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1164 int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
1165
1166 // Scan forward looking for post-index opportunities. Updating instructions
1167 // can't be formed if the memory instruction doesn't have the offset we're
1168 // looking for.
1169 if (MIUnscaledOffset != UnscaledOffset)
1170 return E;
1171
1172 // If the base register overlaps a destination register, we can't
1173 // merge the update.
1174 bool IsPairedInsn = isPairedLdSt(MemMI);
1175 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1176 unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1177 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1178 return E;
1179 }
1180
1181 // Track which registers have been modified and used between the first insn
1182 // (inclusive) and the second insn.
1183 BitVector ModifiedRegs, UsedRegs;
1184 ModifiedRegs.resize(TRI->getNumRegs());
1185 UsedRegs.resize(TRI->getNumRegs());
1186 ++MBBI;
1187 for (unsigned Count = 0; MBBI != E; ++MBBI) {
1188 MachineInstr *MI = MBBI;
1189 // Skip DBG_VALUE instructions. Otherwise debug info can affect the
1190 // optimization by changing how far we scan.
1191 if (MI->isDebugValue())
1192 continue;
1193
1194 // Now that we know this is a real instruction, count it.
1195 ++Count;
1196
1197 // If we found a match, return it.
1198 if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
1199 return MBBI;
1200
1201 // Update the status of what the instruction clobbered and used.
1202 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1203
1204 // Otherwise, if the base register is used or modified, we have no match, so
1205 // return early.
1206 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1207 return E;
1208 }
1209 return E;
1210 }
1211
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I,unsigned Limit)1212 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
1213 MachineBasicBlock::iterator I, unsigned Limit) {
1214 MachineBasicBlock::iterator B = I->getParent()->begin();
1215 MachineBasicBlock::iterator E = I->getParent()->end();
1216 MachineInstr *MemMI = I;
1217 MachineBasicBlock::iterator MBBI = I;
1218
1219 unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
1220 int Offset = getLdStOffsetOp(MemMI).getImm();
1221
1222 // If the load/store is the first instruction in the block, there's obviously
1223 // not any matching update. Ditto if the memory offset isn't zero.
1224 if (MBBI == B || Offset != 0)
1225 return E;
1226 // If the base register overlaps a destination register, we can't
1227 // merge the update.
1228 bool IsPairedInsn = isPairedLdSt(MemMI);
1229 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
1230 unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
1231 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
1232 return E;
1233 }
1234
1235 // Track which registers have been modified and used between the first insn
1236 // (inclusive) and the second insn.
1237 BitVector ModifiedRegs, UsedRegs;
1238 ModifiedRegs.resize(TRI->getNumRegs());
1239 UsedRegs.resize(TRI->getNumRegs());
1240 --MBBI;
1241 for (unsigned Count = 0; MBBI != B; --MBBI) {
1242 MachineInstr *MI = MBBI;
1243 // Skip DBG_VALUE instructions. Otherwise debug info can affect the
1244 // optimization by changing how far we scan.
1245 if (MI->isDebugValue())
1246 continue;
1247
1248 // Now that we know this is a real instruction, count it.
1249 ++Count;
1250
1251 // If we found a match, return it.
1252 if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
1253 return MBBI;
1254
1255 // Update the status of what the instruction clobbered and used.
1256 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
1257
1258 // Otherwise, if the base register is used or modified, we have no match, so
1259 // return early.
1260 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
1261 return E;
1262 }
1263 return E;
1264 }
1265
tryToMergeLdStInst(MachineBasicBlock::iterator & MBBI)1266 bool AArch64LoadStoreOpt::tryToMergeLdStInst(
1267 MachineBasicBlock::iterator &MBBI) {
1268 MachineInstr *MI = MBBI;
1269 MachineBasicBlock::iterator E = MI->getParent()->end();
1270 // If this is a volatile load/store, don't mess with it.
1271 if (MI->hasOrderedMemoryRef())
1272 return false;
1273
1274 // Make sure this is a reg+imm (as opposed to an address reloc).
1275 if (!getLdStOffsetOp(MI).isImm())
1276 return false;
1277
1278 // Check if this load/store has a hint to avoid pair formation.
1279 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1280 if (TII->isLdStPairSuppressed(MI))
1281 return false;
1282
1283 // Look ahead up to ScanLimit instructions for a pairable instruction.
1284 LdStPairFlags Flags;
1285 MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
1286 if (Paired != E) {
1287 if (isNarrowLoad(MI)) {
1288 ++NumNarrowLoadsPromoted;
1289 } else if (isNarrowStore(MI)) {
1290 ++NumZeroStoresPromoted;
1291 } else {
1292 ++NumPairCreated;
1293 if (isUnscaledLdSt(MI))
1294 ++NumUnscaledPairCreated;
1295 }
1296
1297 // Merge the loads into a pair. Keeping the iterator straight is a
1298 // pain, so we let the merge routine tell us what the next instruction
1299 // is after it's done mucking about.
1300 MBBI = mergePairedInsns(MBBI, Paired, Flags);
1301 return true;
1302 }
1303 return false;
1304 }
1305
/// Run the load/store peephole transformations over one basic block.
/// \param MBB the block to optimize in place.
/// \param enableNarrowLdOpt whether the narrow-load/store promotion pass
///        (transformation 1 below) should run; see enableNarrowLdMerge().
/// \returns true if any instruction in the block was changed.
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool enableNarrowLdOpt) {
  bool Modified = false;
  // Three transformations to do here:
  // 1) Find narrow loads that can be converted into a single wider load
  //    with bitfield extract instructions.
  //      e.g.,
  //        ldrh w0, [x2]
  //        ldrh w1, [x2, #2]
  //        ; becomes
  //        ldr w0, [x2]
  //        ubfx w1, w0, #16, #16
  //        and w0, w0, #ffff
  // 2) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 3) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  // Transformation 1: promote adjacent narrow (8/16-bit) loads/stores. The
  // whole loop is skipped when enableNarrowLdOpt is false. On a successful
  // merge, tryToMergeLdStInst() advances MBBI itself, so we only ++MBBI when
  // nothing was merged.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       enableNarrowLdOpt && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRBBui:
    case AArch64::LDRHHui:
    case AArch64::LDRSBWui:
    case AArch64::LDRSHWui:
    case AArch64::STRBBui:
    case AArch64::STRHHui:
    // Unscaled instructions.
    case AArch64::LDURBBi:
    case AArch64::LDURHHi:
    case AArch64::LDURSBWi:
    case AArch64::LDURSHWi:
    case AArch64::STURBBi:
    case AArch64::STURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  // Transformation 2: form ldp/stp from adjacent 32/64/128-bit loads and
  // stores. Same iterator discipline as above.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  // Transformation 3: fold a base-register add/sub into the load/store as a
  // pre- or post-index writeback. Three searches are tried in order:
  //   a) forward for an update of the full base (post-index form),
  //   b) backward for an update that computed this address (pre-index form),
  //   c) forward for an update matching this instruction's offset
  //      (pre-index form).
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      //   ldr x0, [x20]
      //   add x20, x20, #32
      // merged into:
      //   ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions of unscaled
      // instructions, so move to the next instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      //   add x0, x0, #8
      //   ldr x1, [x0]
      // merged into:
      //   ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward to try to find a pre-index instruction: an add whose
      // immediate matches this instruction's (unscaled) offset. For example,
      //   ldr x1, [x0, #64]
      //   add x0, x0, #64
      // merged into:
      //   ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}
1527
enableNarrowLdMerge(MachineFunction & Fn)1528 bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
1529 bool ProfitableArch = Subtarget->isCortexA57();
1530 // FIXME: The benefit from converting narrow loads into a wider load could be
1531 // microarchitectural as it assumes that a single load with two bitfield
1532 // extracts is cheaper than two narrow loads. Currently, this conversion is
1533 // enabled only in cortex-a57 on which performance benefits were verified.
1534 return ProfitableArch && !Subtarget->requiresStrictAlign();
1535 }
1536
runOnMachineFunction(MachineFunction & Fn)1537 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1538 Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
1539 TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
1540 TRI = Subtarget->getRegisterInfo();
1541
1542 bool Modified = false;
1543 bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
1544 for (auto &MBB : Fn)
1545 Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
1546
1547 return Modified;
1548 }
1549
1550 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
1551 // loads and stores near one another?
1552
/// createAArch64LoadStoreOptimizationPass - returns a new instance of the
/// post-register-allocation load / store optimization pass. The caller
/// (the pass manager) takes ownership of the returned pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}
1558