//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled instructions");
STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx;

  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre- or post-indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  // Decide whether converting two narrow loads into a single wider load
  // with bitfield extracts is worthwhile on this function's subtarget.
  bool enableNarrowLdMerge(MachineFunction &Fn);

  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURBBi:
  case AArch64::STURHHi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSHWi:
    return true;
  }
}

static bool isUnscaledLdSt(MachineInstr *MI) {
  return isUnscaledLdSt(MI->getOpcode());
}

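// Map a narrow load to the W-register bitfield-extract opcode matching its
// extension kind: UBFMWri for zero-extending loads, SBFMWri for
// sign-extending ones.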
static unsigned getBitExtrOpcode(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode.");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return AArch64::UBFMWri;
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
    return AArch64::SBFMWri;
  }
}

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

static bool isNarrowStore(MachineInstr *MI) {
  return isNarrowStore(MI->getOpcode());
}

static bool isNarrowLoad(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
    return true;
  }
}

static bool isNarrowLoad(MachineInstr *MI) {
  return isNarrowLoad(MI->getOpcode());
}

// Returns the size in bytes of the memory access performed by MI, which is
// also the immediate scaling factor for the corresponding scaled form.
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}

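// Map a sign-extending load opcode to its non-extending equivalent of the
// same memory width (e.g. LDRSHWui -> LDRHHui). Other valid load/store
// opcodes map to themselves; invalid ones are reported via *IsValidLdStrOpc.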
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSBWui:
    return AArch64::LDRBBui;
  case AArch64::LDRSHWui:
    return AArch64::LDRHHui;
  case AArch64::LDURSBWi:
    return AArch64::LDURBBi;
  case AArch64::LDURSHWi:
    return AArch64::LDURHHi;
  }
}

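// Map a load/store opcode to its paired equivalent (e.g. LDRXui -> LDPXi).
// Narrow loads and stores instead map to the single opcode of twice the
// width (e.g. STRHHui -> STRWui), which is how narrow merges are emitted.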
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
  case AArch64::LDURSHWi:
    return AArch64::LDURWi;
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
    return AArch64::LDRHHui;
  case AArch64::LDURBBi:
  case AArch64::LDURSBWi:
    return AArch64::LDURHHi;
  }
}

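// Map an opcode to its pre-indexed (writeback) form, e.g. "str x0, [x1, #8]"
// (STRXui) becomes "str x0, [x1, #8]!" (STRXpre).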
static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

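// Map an opcode to its post-indexed (writeback) form, e.g. "ldr x0, [x1]"
// (LDRXui) becomes "ldr x0, [x1], #16" (LDRXpost).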
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}

static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI->getOperand(Idx);
}
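
// For reference, a paired instruction such as "ldp x0, x1, [x2, #8]" has
// operands (Rt, Rt2, Rn, imm) = (x0, x1, x2, 8), while a non-paired
// "ldr x0, [x2, #8]" has (Rt, Rn, imm) = (x0, x2, 8); the helpers above
// index into that layout.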

// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
                      (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that the flags stay compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
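  // E.g., merging "ldr x1, [x0, #8]" (I) with a later "ldr x2, [x0]"
  // (Paired) forms "ldp x2, x1, [x0]": the lower-addressed load supplies Rt.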
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isNarrowLoad(Opc)) {
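    // E.g., on a little-endian target,
    //   ldrh w0, [x2]
    //   ldrh w1, [x2, #2]
    // becomes
    //   ldr w0, [x2]
    //   ubfx w1, w0, #16, #16
    //   and w0, w0, #0xffff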
    // Change the scaled offset from small to large type.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // When merging small (< 32 bit) loads for big-endian targets, the order of
    // the component parts gets swapped.
    if (!Subtarget->isLittleEndian())
      std::swap(RtMI, Rt2MI);
    // Construct the new load instruction.
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n    ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << "  with instructions:\n    ");
    DEBUG((NewMemMI)->print(dbgs()));

    int Width = getMemScale(I) == 1 ? 8 : 16;
    int LSBLow = 0;
    int LSBHigh = Width;
    int ImmsLow = LSBLow + Width - 1;
    int ImmsHigh = LSBHigh + Width - 1;
    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
      // Create the bitfield extract for high bits.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }
    } else {
      // Create the bitfield extract for low bits.
      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
        // For unsigned, prefer to use AND for low bits.
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(AArch64::ANDWri))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(ImmsLow);
      } else {
        BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                            TII->get(getBitExtrOpcode(RtMI)))
                        .addOperand(getLdStRegOp(RtMI))
                        .addReg(getLdStRegOp(RtNewDest).getReg())
                        .addImm(LSBLow)
                        .addImm(ImmsLow);
      }

      // Create the bitfield extract for high bits.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(getBitExtrOpcode(Rt2MI)))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(LSBHigh)
                      .addImm(ImmsHigh);
    }
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  if (isNarrowStore(Opc)) {
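    // E.g., two adjacent narrow zero stores
    //   strh wzr, [x0]
    //   strh wzr, [x0, #2]
    // become a single wider zero store
    //   str wzr, [x0]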
    // Change the scaled offset from small to large type.
    if (!IsUnscaled) {
      assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
      OffsetImm /= 2;
    }
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(I))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
    // Copy MachineMemOperands from the original stores.
    concatenateMemOperands(MIB, I, Paired);
  } else {
    // Handle Unscaled
    if (IsUnscaled)
      OffsetImm /= OffsetStride;
    MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(RtMI))
              .addOperand(getLdStRegOp(Rt2MI))
              .addOperand(BaseRegOp)
              .addImm(OffsetImm);
  }

  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  //        instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
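  // E.g., a byte offset of 16 with an 8-byte stride becomes element offset 2,
  // which must fit in the pair instruction's 7-bit signed immediate field.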
  if (IsUnscaled)
    Offset /= OffsetStride;

  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
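// E.g., alignTo(5, 4) == 8 and alignTo(8, 4) == 8.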
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
                     const AArch64InstrInfo *TII) {
  // One of the instructions must modify memory.
  if (!MIa->mayStore() && !MIb->mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
    return false;

  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}

static bool mayAlias(MachineInstr *MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     const AArch64InstrInfo *TII) {
  for (auto &MIb : MemInsns)
    if (mayAlias(MIa, MIb, TII))
      return true;

  return false;
}

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  unsigned Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  bool IsNarrowStore = isNarrowStore(Opc);

  // For narrow stores, find only the case where the stored value is WZR.
  if (IsNarrowStore && Reg != AArch64::WZR)
    return E;

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    Flags.setSExtIdx(-1);
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      assert(IsValidLdStrOpc &&
             "Given Opc should be a Load or Store with an immediate");
      // Opc will be the first instruction in the pair.
      Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
      assert(MI->mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop looking
        // as something is going on that we don't have enough information to
        // safely transform. Similarly, stop if we see a hint to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdSt(MI);
        bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
        if (!IsNarrowLoad &&
            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        if (IsNarrowLoad || IsNarrowStore) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow
          // input, bail and keep looking.
          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        } else {
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        // For narrow stores, allow only when the stored value is the same
        // (i.e., WZR).
        if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
            (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, TII)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, TII)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI->isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI->mayLoadOrStore())
      MemInsns.push_back(MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value);
  } else {
    // Paired instruction.
    int Scale = getMemScale(I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I, 0))
              .addOperand(getLdStRegOp(I, 1))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value / Scale);
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
                                               MachineInstr *MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI->getOperand(0).getReg() != BaseReg ||
        MI->getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI->getOperand(2).getImm();
    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor.  The scaled offset must also fit into a signed 7-bit
    // integer.
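    // E.g., for "ldp x0, x1, [x2]" (scale 8) an update of #24 yields a
    // scaled offset of 3 and can be merged; an update of #20 is not a
    // multiple of the scale and is rejected.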
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 63 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == MI->getOperand(2).getImm())
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities.  Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = MBBI;
  MachineBasicBlock::iterator E = MI->getParent()->end();
  // If this is a volatile load/store, don't mess with it.
  if (MI->hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm (as opposed to an address reloc).
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Check if this load/store has a hint to avoid pair formation.
  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
  if (TII->isLdStPairSuppressed(MI))
    return false;

  // Look ahead up to ScanLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
  if (Paired != E) {
    if (isNarrowLoad(MI)) {
      ++NumNarrowLoadsPromoted;
    } else if (isNarrowStore(MI)) {
      ++NumZeroStoresPromoted;
    } else {
      ++NumPairCreated;
      if (isUnscaledLdSt(MI))
        ++NumUnscaledPairCreated;
    }

    // Merge the loads into a pair. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool enableNarrowLdOpt) {
  bool Modified = false;
  // Three transformations to do here:
  // 1) Find narrow loads that can be converted into a single wider load
  //    with bitfield extract instructions.
  //      e.g.,
  //        ldrh w0, [x2]
  //        ldrh w1, [x2, #2]
  //        ; becomes
  //        ldr w0, [x2]
  //        ubfx w1, w0, #16, #16
  //        and w0, w0, #ffff
  // 2) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 3) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       enableNarrowLdOpt && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRBBui:
    case AArch64::LDRHHui:
    case AArch64::LDRSBWui:
    case AArch64::LDRSHWui:
    case AArch64::STRBBui:
    case AArch64::STRHHui:
    // Unscaled instructions.
    case AArch64::LDURBBi:
    case AArch64::LDURHHi:
    case AArch64::LDURSBWi:
    case AArch64::LDURSHWi:
    case AArch64::STURBBi:
    case AArch64::STURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      // ldr x0, [x20]
      // add x20, x20, #32
      //   merged into:
      // ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions, so move to the next
      // instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward to try to find a pre-index instruction. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
  bool ProfitableArch = Subtarget->isCortexA57();
  // FIXME: The benefit from converting narrow loads into a wider load could be
  // microarchitectural as it assumes that a single load with two bitfield
  // extracts is cheaper than two narrow loads. Currently, this conversion is
  // enabled only in cortex-a57 on which performance benefits were verified.
  return ProfitableArch && !Subtarget->requiresStrictAlign();
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();

  bool Modified = false;
  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}