AArch64LoadStoreOptimizer.cpp revision cd81d94322a39503e4a3e87b6ee03d4fcb3465fb
//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

15#include "AArch64InstrInfo.h"
16#include "MCTargetDesc/AArch64AddressingModes.h"
17#include "llvm/ADT/BitVector.h"
18#include "llvm/CodeGen/MachineBasicBlock.h"
19#include "llvm/CodeGen/MachineFunctionPass.h"
20#include "llvm/CodeGen/MachineInstr.h"
21#include "llvm/CodeGen/MachineInstrBuilder.h"
22#include "llvm/Target/TargetInstrInfo.h"
23#include "llvm/Target/TargetMachine.h"
24#include "llvm/Target/TargetRegisterInfo.h"
25#include "llvm/Support/CommandLine.h"
26#include "llvm/Support/Debug.h"
27#include "llvm/Support/ErrorHandling.h"
28#include "llvm/Support/raw_ostream.h"
29#include "llvm/ADT/Statistic.h"
30using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled load/stores");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// Placeholder while testing unscaled load/store combining
static cl::opt<bool> EnableAArch64UnscaledMemOp(
    "aarch64-unscaled-mem-op", cl::Hidden,
    cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));

namespace {
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               bool &MergeForward,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired, bool MergeForward);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int Value);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Merge a pre-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator Update);

  // Merge a post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
                         MachineBasicBlock::iterator Update);

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return "AArch64 load / store optimization pass";
  }

private:
  int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
} // end anonymous namespace

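// Unscaled loads and stores (LDUR/STUR) take a signed 9-bit byte offset,
// whereas the scaled forms matched below (the "ui" opcodes) take an unsigned
// 12-bit immediate that is implicitly multiplied by the access size.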
static bool isUnscaledLdst(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}

// Size in bytes of the data moved by a scaled or unscaled load or store.
int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
  switch (MemMI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown size!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return 4;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return 8;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return 16;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return 4;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return 8;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return 16;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return 4;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return 8;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  }
}

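// Map a scaled load/store opcode to its pre-indexed (writeback) form. A
// pre-indexed access such as "ldr x0, [x1, #16]!" adds the offset to the
// base register first, then loads from the updated address.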
static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  }
}

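// Map a scaled load/store opcode to its post-indexed (writeback) form. A
// post-indexed access such as "ldr x0, [x1], #16" loads from the unmodified
// base register first, then adds the offset to it.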
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  }
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      bool MergeForward) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Whichever way we merge, the merge will invalidate
  // the iterator, and we don't need to scan the new instruction, as it's a
  // pairwise instruction, which we're not considering for further action
  // anyway.
  if (NextI == Paired)
    ++NextI;

  bool IsUnscaled = isUnscaledLdst(I->getOpcode());
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;

  unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the resulting flags are compatible with the
  // input code.
  MachineOperand &BaseRegOp =
      MergeForward ? Paired->getOperand(1) : I->getOperand(1);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (I->getOperand(2).getImm() ==
      Paired->getOperand(2).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }
  // For unscaled instructions, convert the byte offset into the element
  // offset encoded by the paired instruction.
  int OffsetImm = RtMI->getOperand(2).getImm();
  if (IsUnscaled && EnableAArch64UnscaledMemOp)
    OffsetImm /= OffsetStride;

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(RtMI->getOperand(0))
                                .addOperand(Rt2MI->getOperand(0))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  //        instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    MachineOperand &MO = MI->getOperand(i);
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}
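// Note that MCRegAliasIterator visits the register itself as well as all of
// its aliases, so a def of w0 also marks x0 as modified (and vice versa). A
// call's register mask likewise marks every non-preserved register as
// modified via setBitsNotInMask.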

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  if (!IsUnscaled && (Offset > 63 || Offset < -64))
    return false;
  if (IsUnscaled) {
    // Convert the byte-offset used by unscaled into an "element" offset used
    // by the scaled pair load/store instructions.
    int ElemOffset = Offset / OffsetStride;
    if (ElemOffset > 63 || ElemOffset < -64)
      return false;
  }
  return true;
}
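// The pair instructions encode a signed 7-bit element offset, i.e. -64 to 63
// access-size units. For an X-register pair (8-byte elements), for example,
// that covers byte offsets -512 to 504 in steps of 8.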

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
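// For example, alignTo(5, 8) == 8 and alignTo(-3, 8) == 0. findMatchingInsn
// below uses alignTo(MinOffset, OffsetStride) == MinOffset to test whether an
// unscaled offset is an exact multiple of the access size.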

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      bool &MergeForward, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  int Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdst(Opc);
  unsigned Reg = FirstMI->getOperand(0).getReg();
  unsigned BaseReg = FirstMI->getOperand(1).getReg();
  int Offset = FirstMI->getOperand(2).getImm();
  // Early exit if the first instruction modifies the base register,
  // e.g., ldr x0, [x0].
  // Also early exit if the offset is not possible to match (6 bits of
  // positive range, plus allow an extra one in case we find a later insn
  // that matches with Offset-1).
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) {
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = MI->getOperand(1).getReg();
      int MIOffset = MI->getOperand(2).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop looking
        // as something is going on that we don't have enough information to
        // safely transform. Similarly, stop if we see a hint to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
        if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the alignment requirements of the paired (scaled) instruction
        // can't express the offset of the unscaled input, bail and keep
        // looking.
        if (IsUnscaled && EnableAArch64UnscaledMemOp &&
            (alignTo(MinOffset, OffsetStride) != MinOffset)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == MI->getOperand(0).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions, we can combine the second into the first.
        if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
            !UsedRegs[MI->getOperand(0).getReg()]) {
          MergeForward = false;
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions, we can combine the first into the
        // second.
        if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
            !UsedRegs[FirstMI->getOperand(0).getReg()]) {
          MergeForward = true;
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, but does (or can)
    // modify memory, stop searching, as we don't have alias analysis or
    // anything like that to tell us whether the access is tromping on the
    // locations we care about. The big one we want to catch is calls.
    //
    // FIXME: Theoretically, we can do better than that for SP and FP based
    // references since we can effectively know where those are touching. It's
    // unclear if it's worth the extra code, though. Most paired instructions
    // will be sequential, perhaps with a few intervening non-memory related
    // instructions.
    if (MI->mayStore() || MI->isCall())
      return E;
    // Likewise, if we're matching a store instruction, we don't want to
    // move across a load, as it may be reading the same location.
    if (FirstMI->mayStore() && MI->mayLoad())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0))
          .addOperand(I->getOperand(0))
          .addOperand(I->getOperand(1))
          .addImm(Value);
  (void)MIB;

  DEBUG(dbgs() << "Creating pre-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0))
          .addOperand(I->getOperand(0))
          .addOperand(I->getOperand(1))
          .addImm(Value);
  (void)MIB;

  DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
                                 int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;
    // If the instruction has the base register as source and dest and the
    // immediate will fit in a signed 9-bit integer, then we have a match.
    if (MI->getOperand(0).getReg() == BaseReg &&
        MI->getOperand(1).getReg() == BaseReg &&
        MI->getOperand(2).getImm() <= 255 &&
        MI->getOperand(2).getImm() >= -256) {
      // If we have a non-zero Offset, we check that it matches the amount
      // we're adding to the register.
      if (!Offset || Offset == MI->getOperand(2).getImm())
        return true;
    }
    break;
  }
  return false;
}
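// For example, with BaseReg == x2, "add x2, x2, #8" matches when Offset is 0
// (which acts as a wildcard accepting any increment) or 8, and
// "sub x2, x2, #8" matches when Offset is -8.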

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int Value) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;
  const MachineFunction &MF = *MemMI->getParent()->getParent();

  unsigned DestReg = MemMI->getOperand(0).getReg();
  unsigned BaseReg = MemMI->getOperand(1).getReg();
  int Offset = MemMI->getOperand(2).getImm() *
               TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();

  // If the base register overlaps the destination register, we can't
  // merge the update.
  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    return E;

  // Scan forward looking for post-index opportunities.
  // Updating instructions can't be formed if the memory insn already
  // has an offset other than the value we're looking for.
  if (Offset != Value)
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MI, BaseReg, Value))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;
  const MachineFunction &MF = *MemMI->getParent()->getParent();

  unsigned DestReg = MemMI->getOperand(0).getReg();
  unsigned BaseReg = MemMI->getOperand(1).getReg();
  int Offset = MemMI->getOperand(2).getImm();
  unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps the destination register, we can't
  // merge the update.
  if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;
  // Two transformations to do here:
  // 1) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 2) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    // do the unscaled versions as well
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi: {
      // If this is a volatile load/store, don't mess with it.
      if (MI->hasOrderedMemoryRef()) {
        ++MBBI;
        break;
      }
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Check if this load/store has a hint to avoid pair formation.
      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
      if (TII->isLdStPairSuppressed(MI)) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a pairable instruction.
      bool MergeForward = false;
      MachineBasicBlock::iterator Paired =
          findMatchingInsn(MBBI, MergeForward, ScanLimit);
      if (Paired != E) {
        // Merge the loads into a pair. Keeping the iterator straight is a
        // pain, so we let the merge routine tell us what the next instruction
        // is after it's done mucking about. Record whether the instruction
        // was unscaled before the merge erases it.
        bool IsUnscaled = isUnscaledLdst(MI->getOpcode());
        MBBI = mergePairedInsns(MBBI, Paired, MergeForward);

        Modified = true;
        ++NumPairCreated;
        if (IsUnscaled)
          ++NumUnscaledPairCreated;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    int Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    // do the unscaled versions as well
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a base register update
      // that can be merged as a post-index writeback.
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePostIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Unscaled load/stores have no pre/post-indexed variants to merge into,
      // so move to the next instruction.
      if (isUnscaledLdst(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Look forward to try to find a pre-index update we can fold. For
      // example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!

      // The immediate in the load/store is scaled by the size of the register
      // being loaded. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int Value = MI->getOperand(2).getImm() *
                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
                      ->getSize();
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
  TRI = TM.getRegisterInfo();

  bool Modified = false;
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}
