1//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file This file contains a pass that performs load / store related peephole
11/// optimizations. This pass should be run after register allocation.
12//
13//===----------------------------------------------------------------------===//
14
15#include "ARM.h"
16#include "ARMBaseInstrInfo.h"
17#include "ARMBaseRegisterInfo.h"
18#include "ARMISelLowering.h"
19#include "ARMMachineFunctionInfo.h"
20#include "ARMSubtarget.h"
21#include "MCTargetDesc/ARMAddressingModes.h"
22#include "ThumbRegisterInfo.h"
23#include "llvm/ADT/DenseMap.h"
24#include "llvm/ADT/STLExtras.h"
25#include "llvm/ADT/SmallPtrSet.h"
26#include "llvm/ADT/SmallSet.h"
27#include "llvm/ADT/SmallVector.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/CodeGen/MachineBasicBlock.h"
30#include "llvm/CodeGen/MachineFunctionPass.h"
31#include "llvm/CodeGen/MachineInstr.h"
32#include "llvm/CodeGen/MachineInstrBuilder.h"
33#include "llvm/CodeGen/MachineRegisterInfo.h"
34#include "llvm/CodeGen/RegisterClassInfo.h"
35#include "llvm/CodeGen/SelectionDAGNodes.h"
36#include "llvm/CodeGen/LivePhysRegs.h"
37#include "llvm/IR/DataLayout.h"
38#include "llvm/IR/DerivedTypes.h"
39#include "llvm/IR/Function.h"
40#include "llvm/Support/Allocator.h"
41#include "llvm/Support/Debug.h"
42#include "llvm/Support/ErrorHandling.h"
43#include "llvm/Support/raw_ostream.h"
44#include "llvm/Target/TargetInstrInfo.h"
45#include "llvm/Target/TargetMachine.h"
46#include "llvm/Target/TargetRegisterInfo.h"
47using namespace llvm;
48
49#define DEBUG_TYPE "arm-ldst-opt"
50
51STATISTIC(NumLDMGened , "Number of ldm instructions generated");
52STATISTIC(NumSTMGened , "Number of stm instructions generated");
53STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
54STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
55STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
56STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
57STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
58STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
59STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
60STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
61STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
62
63namespace llvm {
64void initializeARMLoadStoreOptPass(PassRegistry &);
65}
66
67#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
68
namespace {
  /// Post-register-allocation pass to combine adjacent load / store
  /// instructions into ldm / stm (and ldrd / strd) instructions.
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {
      initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry());
    }

    // Cached per-function state, set up in runOnMachineFunction.
    const MachineFunction *MF;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    const TargetLowering *TL;
    ARMFunctionInfo *AFI;
    // Liveness tracking for scratch-register selection; stepped backwards
    // through a block by moveLiveRegsBefore.
    LivePhysRegs LiveRegs;
    RegisterClassInfo RegClassInfo;
    // Cursor of the last liveness query; queries in one block must come in
    // reverse (bottom-up) order.
    MachineBasicBlock::const_iterator LiveRegPos;
    bool LiveRegsValid;      // LiveRegs/LiveRegPos valid for the current block.
    bool RegClassInfoValid;  // RegClassInfo computed for the current function.
    bool isThumb1, isThumb2; // Subtarget instruction-set mode.

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return ARM_LOAD_STORE_OPT_NAME;
    }

  private:
    /// A set of load/store MachineInstrs with same base register sorted by
    /// offset.
    struct MemOpQueueEntry {
      MachineInstr *MI;
      int Offset;        ///< Load/Store offset.
      unsigned Position; ///< Position as counted from end of basic block.
      MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
        : MI(MI), Offset(Offset), Position(Position) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;

    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
    /// merged into a LDM/STM.
    struct MergeCandidate {
      /// List of instructions ordered by load/store offset.
      SmallVector<MachineInstr*, 4> Instrs;
      /// Index in Instrs of the instruction being latest in the schedule.
      unsigned LatestMIIdx;
      /// Index in Instrs of the instruction being earliest in the schedule.
      unsigned EarliestMIIdx;
      /// Index into the basic block where the merged instruction will be
      /// inserted. (See MemOpQueueEntry.Position)
      unsigned InsertPos;
      /// Whether the instructions can be merged into a ldm/stm instruction.
      bool CanMergeToLSMulti;
      /// Whether the instructions can be merged into a ldrd/strd instruction.
      bool CanMergeToLSDouble;
    };
    // Candidates are bump-allocated; they live until the allocator is reset.
    SpecificBumpPtrAllocator<MergeCandidate> Allocator;
    SmallVector<const MergeCandidate*,4> Candidates;
    SmallVector<MachineInstr*,4> MergeBaseCandidates;

    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
                            MachineBasicBlock::const_iterator Before);
    unsigned findFreeReg(const TargetRegisterClass &RegClass);
    void UpdateBaseRegUses(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           DebugLoc DL, unsigned Base, unsigned WordOffset,
                           ARMCC::CondCodes Pred, unsigned PredReg);
    MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
    MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
    void FormCandidates(const MemOpQueue &MemOps);
    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
    bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
    bool CombineMovBx(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}
158
159INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false)
160
161static bool definesCPSR(const MachineInstr *MI) {
162  for (const auto &MO : MI->operands()) {
163    if (!MO.isReg())
164      continue;
165    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
166      // If the instruction has live CPSR def, then it's not safe to fold it
167      // into load / store.
168      return true;
169  }
170
171  return false;
172}
173
174static int getMemoryOpOffset(const MachineInstr *MI) {
175  unsigned Opcode = MI->getOpcode();
176  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
177  unsigned NumOperands = MI->getDesc().getNumOperands();
178  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
179
180  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
181      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
182      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
183      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
184    return OffField;
185
186  // Thumb1 immediate offsets are scaled by 4
187  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
188      Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
189    return OffField * 4;
190
191  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
192    : ARM_AM::getAM5Offset(OffField) * 4;
193  ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField)
194    : ARM_AM::getAM5Op(OffField);
195
196  if (Op == ARM_AM::sub)
197    return -Offset;
198
199  return Offset;
200}
201
/// Return the base-register operand of load/store \p MI. For every
/// single-load/store opcode this pass handles, the base is operand 1.
static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
  return MI.getOperand(1);
}
205
/// Return the transfer-register operand (the value loaded or stored) of
/// load/store \p MI; it is always operand 0.
static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
  return MI.getOperand(0);
}
209
/// Map a single load/store opcode onto the load/store-multiple opcode for
/// addressing sub-mode \p Mode, bumping the matching statistic counter.
/// Returns 0 where no non-writeback multiple form exists (VFP db modes);
/// asserts on opcode/mode combinations this pass never produces.
static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::tLDRi:
  case ARM::tLDRspi:
    // tLDMIA is writeback-only - unless the base register is in the input
    // reglist.
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tLDMIA;
    }
  case ARM::tSTRi:
  case ARM::tSTRspi:
    // There is no non-writeback tSTMIA either.
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tSTMIA_UPD;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
    }
  }
}
294
/// Return the addressing sub-mode (increment/decrement, before/after) encoded
/// in load/store-multiple opcode \p Opcode.
static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }
}
348
349static bool isT1i32Load(unsigned Opc) {
350  return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
351}
352
353static bool isT2i32Load(unsigned Opc) {
354  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
355}
356
357static bool isi32Load(unsigned Opc) {
358  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ;
359}
360
361static bool isT1i32Store(unsigned Opc) {
362  return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
363}
364
365static bool isT2i32Store(unsigned Opc) {
366  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
367}
368
369static bool isi32Store(unsigned Opc) {
370  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
371}
372
373static bool isLoadSingle(unsigned Opc) {
374  return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
375}
376
/// Return the number of immediate-offset units per word (4 bytes) for Thumb1
/// load/store opcode \p Opc. Word accesses encode the offset in words (1),
/// halfword accesses in halfwords (2), byte accesses in bytes (4). Used by
/// UpdateBaseRegUses to convert a word offset into each opcode's units.
static unsigned getImmScale(unsigned Opc) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
    return 1;
  case ARM::tLDRHi:
  case ARM::tSTRHi:
    return 2;
  case ARM::tLDRBi:
  case ARM::tSTRBi:
    return 4;
  }
}
393
/// Return the number of bytes transferred by load/store \p MI, or 0 for
/// opcodes this pass does not recognize. For load/store-multiple forms the
/// register list is the variadic tail, so its length is the difference
/// between actual and declared operand counts (+1 for the base register,
/// which getNumOperands counts but the reglist arithmetic must not drop).
static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::tLDRspi:
  case ARM::tSTRspi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
  case ARM::t2STMIA:
  case ARM::t2STMDB:
  case ARM::VLDMSIA:
  case ARM::VSTMSIA:
    // 4 bytes per register in the list.
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA:
  case ARM::VSTMDIA:
    // 8 bytes per D-register in the list.
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}
436
/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
///
/// After a Thumb1 LDM/STM with writeback advances \p Base by \p WordOffset
/// words, later users of \p Base in \p MBB (starting at \p MBBI) see a stale
/// value. Where possible this folds the adjustment into those users'
/// immediates; otherwise it inserts a SUBS to restore \p Base and stops.
/// \p Pred / \p PredReg are the predicate to apply to any inserted SUB.
void
ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   DebugLoc DL, unsigned Base,
                                   unsigned WordOffset,
                                   ARMCC::CondCodes Pred, unsigned PredReg) {
  assert(isThumb1 && "Can only update base register uses for Thumb1!");
  // Start updating any instructions with immediate offsets. Insert a SUB before
  // the first non-updateable instruction (if any).
  for (; MBBI != MBB.end(); ++MBBI) {
    bool InsertSub = false;
    unsigned Opc = MBBI->getOpcode();

    if (MBBI->readsRegister(Base)) {
      int Offset;
      bool IsLoad =
        Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
      bool IsStore =
        Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;

      if (IsLoad || IsStore) {
        // Loads and stores with immediate offsets can be updated, but only if
        // the new offset isn't negative.
        // The MachineOperand containing the offset immediate is the last one
        // before predicates.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
        Offset = MO.getImm() - WordOffset * getImmScale(Opc);

        // If storing the base register, it needs to be reset first.
        unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();

        if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
          MO.setImm(Offset);
        else
          InsertSub = true;

      } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
                 !definesCPSR(MBBI)) {
        // SUBS/ADDS using this register, with a dead def of the CPSR.
        // Merge it with the update; if the merged offset is too large,
        // insert a new sub instead.
        MachineOperand &MO =
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        Offset = (Opc == ARM::tSUBi8) ?
          MO.getImm() + WordOffset * 4 :
          MO.getImm() - WordOffset * 4 ;
        if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
          // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
          // Offset == 0.
          MO.setImm(Offset);
          // The base register has now been reset, so exit early.
          return;
        } else {
          InsertSub = true;
        }

      } else {
        // Can't update the instruction.
        InsertSub = true;
      }

    } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) {
      // Since SUBS sets the condition flags, we can't place the base reset
      // after an instruction that has a live CPSR def.
      // The base register might also contain an argument for a function call.
      InsertSub = true;
    }

    if (InsertSub) {
      // An instruction above couldn't be updated, so insert a sub.
      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
        .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
      return;
    }

    if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base))
      // Register got killed. Stop updating.
      return;
  }

  // End of block was reached.
  if (MBB.succ_size() > 0) {
    // FIXME: Because of a bug, live registers are sometimes missing from
    // the successor blocks' live-in sets. This means we can't trust that
    // information and *always* have to reset at the end of a block.
    // See PR21029.
    if (MBBI != MBB.end()) --MBBI;
    AddDefaultT1CC(
      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
      .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
  }
}
533
534/// Return the first register of class \p RegClass that is not in \p Regs.
535unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
536  if (!RegClassInfoValid) {
537    RegClassInfo.runOnMachineFunction(*MF);
538    RegClassInfoValid = true;
539  }
540
541  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
542    if (!LiveRegs.contains(Reg))
543      return Reg;
544  return 0;
545}
546
/// Compute live registers just before instruction \p Before (in normal schedule
/// direction). Computes backwards so multiple queries in the same block must
/// come in reverse order.
void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
    MachineBasicBlock::const_iterator Before) {
  // Initialize if we never queried in this block: seed with the block's
  // live-outs and start the cursor at the bottom.
  if (!LiveRegsValid) {
    LiveRegs.init(TRI);
    LiveRegs.addLiveOuts(&MBB, true);
    LiveRegPos = MBB.end();
    LiveRegsValid = true;
  }
  // Move backward just before the "Before" position, updating liveness one
  // instruction at a time. Requires Before to be at or above LiveRegPos.
  while (LiveRegPos != Before) {
    --LiveRegPos;
    LiveRegs.stepBackward(*LiveRegPos);
  }
}
565
566static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
567                        unsigned Reg) {
568  for (const std::pair<unsigned, bool> &R : Regs)
569    if (R.first == Reg)
570      return true;
571  return false;
572}
573
574/// Create and insert a LDM or STM with Base as base register and registers in
575/// Regs as the register operands that would be loaded / stored.  It returns
576/// true if the transformation is done.
577MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
578    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
579    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
580    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
581  unsigned NumRegs = Regs.size();
582  assert(NumRegs > 1);
583
584  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
585  // Compute liveness information for that register to make the decision.
586  bool SafeToClobberCPSR = !isThumb1 ||
587    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
588     MachineBasicBlock::LQR_Dead);
589
590  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
591
592  // Exception: If the base register is in the input reglist, Thumb1 LDM is
593  // non-writeback.
594  // It's also not possible to merge an STR of the base register in Thumb1.
595  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
596    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
597    if (Opcode == ARM::tLDRi) {
598      Writeback = false;
599    } else if (Opcode == ARM::tSTRi) {
600      return nullptr;
601    }
602  }
603
604  ARM_AM::AMSubMode Mode = ARM_AM::ia;
605  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
606  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
607  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;
608
609  if (Offset == 4 && haveIBAndDA) {
610    Mode = ARM_AM::ib;
611  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
612    Mode = ARM_AM::da;
613  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
614    // VLDM/VSTM do not support DB mode without also updating the base reg.
615    Mode = ARM_AM::db;
616  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
617    // Check if this is a supported opcode before inserting instructions to
618    // calculate a new base register.
619    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;
620
621    // If starting offset isn't zero, insert a MI to materialize a new base.
622    // But only do so if it is cost effective, i.e. merging more than two
623    // loads / stores.
624    if (NumRegs <= 2)
625      return nullptr;
626
627    // On Thumb1, it's not worth materializing a new base register without
628    // clobbering the CPSR (i.e. not using ADDS/SUBS).
629    if (!SafeToClobberCPSR)
630      return nullptr;
631
632    unsigned NewBase;
633    if (isi32Load(Opcode)) {
634      // If it is a load, then just use one of the destination registers
635      // as the new base. Will no longer be writeback in Thumb1.
636      NewBase = Regs[NumRegs-1].first;
637      Writeback = false;
638    } else {
639      // Find a free register that we can use as scratch register.
640      moveLiveRegsBefore(MBB, InsertBefore);
641      // The merged instruction does not exist yet but will use several Regs if
642      // it is a Store.
643      if (!isLoadSingle(Opcode))
644        for (const std::pair<unsigned, bool> &R : Regs)
645          LiveRegs.addReg(R.first);
646
647      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
648      if (NewBase == 0)
649        return nullptr;
650    }
651
652    int BaseOpc =
653      isThumb2 ? ARM::t2ADDri :
654      (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
655      (isThumb1 && Offset < 8) ? ARM::tADDi3 :
656      isThumb1 ? ARM::tADDi8  : ARM::ADDri;
657
658    if (Offset < 0) {
659      Offset = - Offset;
660      BaseOpc =
661        isThumb2 ? ARM::t2SUBri :
662        (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
663        isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
664    }
665
666    if (!TL->isLegalAddImmediate(Offset))
667      // FIXME: Try add with register operand?
668      return nullptr; // Probably not worth it then.
669
670    // We can only append a kill flag to the add/sub input if the value is not
671    // used in the register list of the stm as well.
672    bool KillOldBase = BaseKill &&
673      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));
674
675    if (isThumb1) {
676      // Thumb1: depending on immediate size, use either
677      //   ADDS NewBase, Base, #imm3
678      // or
679      //   MOV  NewBase, Base
680      //   ADDS NewBase, #imm8.
681      if (Base != NewBase &&
682          (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
683        // Need to insert a MOV to the new base first.
684        if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
685            !STI->hasV6Ops()) {
686          // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
687          if (Pred != ARMCC::AL)
688            return nullptr;
689          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
690            .addReg(Base, getKillRegState(KillOldBase));
691        } else
692          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
693            .addReg(Base, getKillRegState(KillOldBase))
694            .addImm(Pred).addReg(PredReg);
695
696        // The following ADDS/SUBS becomes an update.
697        Base = NewBase;
698        KillOldBase = true;
699      }
700      if (BaseOpc == ARM::tADDrSPi) {
701        assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
702        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
703          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
704          .addImm(Pred).addReg(PredReg);
705      } else
706        AddDefaultT1CC(
707          BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
708          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
709          .addImm(Pred).addReg(PredReg);
710    } else {
711      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
712        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
713        .addImm(Pred).addReg(PredReg).addReg(0);
714    }
715    Base = NewBase;
716    BaseKill = true; // New base is always killed straight away.
717  }
718
719  bool isDef = isLoadSingle(Opcode);
720
721  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
722  // base register writeback.
723  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
724  if (!Opcode)
725    return nullptr;
726
727  // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
728  // - There is no writeback (LDM of base register),
729  // - the base register is killed by the merged instruction,
730  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
731  //   to reset the base register.
732  // Otherwise, don't merge.
733  // It's safe to return here since the code to materialize a new base register
734  // above is also conditional on SafeToClobberCPSR.
735  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
736    return nullptr;
737
738  MachineInstrBuilder MIB;
739
740  if (Writeback) {
741    assert(isThumb1 && "expected Writeback only inThumb1");
742    if (Opcode == ARM::tLDMIA) {
743      assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs");
744      // Update tLDMIA with writeback if necessary.
745      Opcode = ARM::tLDMIA_UPD;
746    }
747
748    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
749
750    // Thumb1: we might need to set base writeback when building the MI.
751    MIB.addReg(Base, getDefRegState(true))
752       .addReg(Base, getKillRegState(BaseKill));
753
754    // The base isn't dead after a merged instruction with writeback.
755    // Insert a sub instruction after the newly formed instruction to reset.
756    if (!BaseKill)
757      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);
758
759  } else {
760    // No writeback, simply build the MachineInstr.
761    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
762    MIB.addReg(Base, getKillRegState(BaseKill));
763  }
764
765  MIB.addImm(Pred).addReg(PredReg);
766
767  for (const std::pair<unsigned, bool> &R : Regs)
768    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));
769
770  return MIB.getInstr();
771}
772
773MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
774    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
775    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
776    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
777  bool IsLoad = isi32Load(Opcode);
778  assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
779  unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
780
781  assert(Regs.size() == 2);
782  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
783                                    TII->get(LoadStoreOpcode));
784  if (IsLoad) {
785    MIB.addReg(Regs[0].first, RegState::Define)
786       .addReg(Regs[1].first, RegState::Define);
787  } else {
788    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
789       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
790  }
791  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
792  return MIB.getInstr();
793}
794
/// Call MergeOps and update MemOps and merges accordingly on success.
/// Builds a single LDRD/STRD or LDM/STM replacing all instructions in
/// \p Cand, erases the originals, and repairs implicit-def and kill flags in
/// the affected range. Returns the merged instruction, or nullptr if neither
/// merge form could be created.
MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
  const MachineInstr *First = Cand.Instrs.front();
  unsigned Opcode = First->getOpcode();
  bool IsLoad = isLoadSingle(Opcode);
  // Per-instruction transfer registers with their kill flags, in candidate
  // order.
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  // Super-registers implicitly defined by the loads; replicated on the merged
  // instruction below.
  SmallVector<unsigned, 4> ImpDefs;
  // NOTE(review): KilledRegs is populated but not read later in this
  // function — possibly leftover; confirm before removing.
  DenseSet<unsigned> KilledRegs;
  DenseSet<unsigned> UsedRegs;
  // Determine list of registers and list of implicit super-register defs.
  for (const MachineInstr *MI : Cand.Instrs) {
    const MachineOperand &MO = getLoadStoreRegOp(*MI);
    unsigned Reg = MO.getReg();
    bool IsKill = MO.isKill();
    if (IsKill)
      KilledRegs.insert(Reg);
    Regs.push_back(std::make_pair(Reg, IsKill));
    UsedRegs.insert(Reg);

    if (IsLoad) {
      // Collect any implicit defs of super-registers, after merging we can't
      // be sure anymore that we properly preserved these live ranges and must
      // remove these implicit operands.
      for (const MachineOperand &MO : MI->implicit_operands()) {
        if (!MO.isReg() || !MO.isDef() || MO.isDead())
          continue;
        assert(MO.isImplicit());
        unsigned DefReg = MO.getReg();

        if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
          continue;
        // We can ignore cases where the super-reg is read and written.
        if (MI->readsRegister(DefReg))
          continue;
        ImpDefs.push_back(DefReg);
      }
    }
  }

  // Attempt the merge. The new instruction is inserted just after the latest
  // (in program order) instruction of the candidate.
  typedef MachineBasicBlock::iterator iterator;
  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
  iterator InsertBefore = std::next(iterator(LatestMI));
  MachineBasicBlock &MBB = *LatestMI->getParent();
  unsigned Offset = getMemoryOpOffset(First);
  unsigned Base = getLoadStoreBaseOp(*First).getReg();
  bool BaseKill = LatestMI->killsRegister(Base);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
  DebugLoc DL = First->getDebugLoc();
  MachineInstr *Merged = nullptr;
  // Prefer LDRD/STRD when the candidate allows it; otherwise fall back to a
  // load/store multiple.
  if (Cand.CanMergeToLSDouble)
    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
                                   Opcode, Pred, PredReg, DL, Regs);
  if (!Merged && Cand.CanMergeToLSMulti)
    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
                                  Opcode, Pred, PredReg, DL, Regs);
  if (!Merged)
    return nullptr;

  // Determine earliest instruction that will get removed. We then keep an
  // iterator just above it so the following erases don't invalidate it.
  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
  bool EarliestAtBegin = false;
  if (EarliestI == MBB.begin()) {
    EarliestAtBegin = true;
  } else {
    EarliestI = std::prev(EarliestI);
  }

  // Remove instructions which have been merged.
  for (MachineInstr *MI : Cand.Instrs)
    MBB.erase(MI);

  // Determine range between the earliest removed instruction and the new one.
  if (EarliestAtBegin)
    EarliestI = MBB.begin();
  else
    EarliestI = std::next(EarliestI);
  auto FixupRange = make_range(EarliestI, iterator(Merged));

  if (isLoadSingle(Opcode)) {
    // If the previous loads defined a super-reg, then we have to mark earlier
    // operands undef; Replicate the super-reg def on the merged instruction.
    for (MachineInstr &MI : FixupRange) {
      for (unsigned &ImpDefReg : ImpDefs) {
        for (MachineOperand &MO : MI.implicit_operands()) {
          if (!MO.isReg() || MO.getReg() != ImpDefReg)
            continue;
          if (MO.readsReg())
            MO.setIsUndef();
          else if (MO.isDef())
            // Super-reg is redefined in the range; drop it from ImpDefs so it
            // is not replicated on the merged instruction.
            ImpDefReg = 0;
        }
      }
    }

    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
    for (unsigned ImpDef : ImpDefs)
      MIB.addReg(ImpDef, RegState::ImplicitDefine);
  } else {
    // Remove kill flags: We are possibly storing the values later now.
    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
    for (MachineInstr &MI : FixupRange) {
      for (MachineOperand &MO : MI.uses()) {
        if (!MO.isReg() || !MO.isKill())
          continue;
        if (UsedRegs.count(MO.getReg()))
          MO.setIsKill(false);
      }
    }
    // Stores never carry implicit super-reg defs here.
    assert(ImpDefs.empty());
  }

  return Merged;
}
911
/// Return true if \p Offset can be encoded in a t2LDRDi8/t2STRDi8: a multiple
/// of 4 whose magnitude is at most 1020.
static bool isValidLSDoubleOffset(int Offset) {
  // Compute |Offset| in unsigned arithmetic so INT_MIN (whose negation
  // overflows int, making C abs() undefined) is handled correctly.
  unsigned Value = Offset < 0 ? -static_cast<unsigned>(Offset)
                              : static_cast<unsigned>(Offset);
  // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally
  // multiplied by 4.
  return (Value % 4) == 0 && Value < 1024;
}
918
/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
/// Scans MemOps (already sorted by ascending offset) and greedily groups runs
/// of adjacent accesses into MergeCandidate records, tracking for each run
/// whether it can become an LDRD/STRD pair and/or an LDM/STM.
void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
  const MachineInstr *FirstMI = MemOps[0].MI;
  unsigned Opcode = FirstMI->getOpcode();
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  unsigned Size = getLSMultipleTransferSize(FirstMI);

  unsigned SIndex = 0;
  unsigned EIndex = MemOps.size();
  do {
    // Look at the first instruction.
    const MachineInstr *MI = MemOps[SIndex].MI;
    int Offset = MemOps[SIndex].Offset;
    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
    unsigned PReg = PMO.getReg();
    // UINT_MAX for undef regs so any real register number compares "lower".
    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
    unsigned Latest = SIndex;
    unsigned Earliest = SIndex;
    unsigned Count = 1;
    // LDRD/STRD is Thumb2-only, integer-only, and needs an encodable offset.
    bool CanMergeToLSDouble =
      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
    // ARM errata 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    if (STI->isCortexM3() && isi32Load(Opcode) &&
        PReg == getLoadStoreBaseOp(*MI).getReg())
      CanMergeToLSDouble = false;

    bool CanMergeToLSMulti = true;
    // On swift vldm/vstm starting with an odd register number as that needs
    // more uops than single vldrs.
    if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
      CanMergeToLSMulti = false;

    // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
    // deprecated; LDM to PC is fine but cannot happen here.
    if (PReg == ARM::SP || PReg == ARM::PC)
      CanMergeToLSMulti = CanMergeToLSDouble = false;

    // Merge following instructions where possible.
    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
      int NewOffset = MemOps[I].Offset;
      // Offsets must be strictly consecutive.
      if (NewOffset != Offset + (int)Size)
        break;
      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
      unsigned Reg = MO.getReg();
      if (Reg == ARM::SP || Reg == ARM::PC)
        break;

      // See if the current load/store may be part of a multi load/store.
      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
      bool PartOfLSMulti = CanMergeToLSMulti;
      if (PartOfLSMulti) {
        // Register numbers must be in ascending order.
        if (RegNum <= PRegNum)
          PartOfLSMulti = false;
        // For VFP / NEON load/store multiples, the registers must be
        // consecutive and within the limit on the number of registers per
        // instruction.
        else if (!isNotVFP && RegNum != PRegNum+1)
          PartOfLSMulti = false;
      }
      // See if the current load/store may be part of a double load/store.
      // A double can only absorb one extra instruction (Count <= 1 so far).
      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;

      if (!PartOfLSMulti && !PartOfLSDouble)
        break;
      CanMergeToLSMulti &= PartOfLSMulti;
      CanMergeToLSDouble &= PartOfLSDouble;
      // Track MemOp with latest and earliest position (Positions are
      // counted in reverse).
      unsigned Position = MemOps[I].Position;
      if (Position < MemOps[Latest].Position)
        Latest = I;
      else if (Position > MemOps[Earliest].Position)
        Earliest = I;
      // Prepare for next MemOp.
      Offset += Size;
      PRegNum = RegNum;
    }

    // Form a candidate from the Ops collected so far.
    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
      Candidate->Instrs.push_back(MemOps[C].MI);
    Candidate->LatestMIIdx = Latest - SIndex;
    Candidate->EarliestMIIdx = Earliest - SIndex;
    Candidate->InsertPos = MemOps[Latest].Position;
    // A single instruction is recorded but never merged.
    if (Count == 1)
      CanMergeToLSMulti = CanMergeToLSDouble = false;
    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
    Candidates.push_back(Candidate);
    // Continue after the chain.
    SIndex += Count;
  } while (SIndex < EIndex);
}
1015
1016static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
1017                                            ARM_AM::AMSubMode Mode) {
1018  switch (Opc) {
1019  default: llvm_unreachable("Unhandled opcode!");
1020  case ARM::LDMIA:
1021  case ARM::LDMDA:
1022  case ARM::LDMDB:
1023  case ARM::LDMIB:
1024    switch (Mode) {
1025    default: llvm_unreachable("Unhandled submode!");
1026    case ARM_AM::ia: return ARM::LDMIA_UPD;
1027    case ARM_AM::ib: return ARM::LDMIB_UPD;
1028    case ARM_AM::da: return ARM::LDMDA_UPD;
1029    case ARM_AM::db: return ARM::LDMDB_UPD;
1030    }
1031  case ARM::STMIA:
1032  case ARM::STMDA:
1033  case ARM::STMDB:
1034  case ARM::STMIB:
1035    switch (Mode) {
1036    default: llvm_unreachable("Unhandled submode!");
1037    case ARM_AM::ia: return ARM::STMIA_UPD;
1038    case ARM_AM::ib: return ARM::STMIB_UPD;
1039    case ARM_AM::da: return ARM::STMDA_UPD;
1040    case ARM_AM::db: return ARM::STMDB_UPD;
1041    }
1042  case ARM::t2LDMIA:
1043  case ARM::t2LDMDB:
1044    switch (Mode) {
1045    default: llvm_unreachable("Unhandled submode!");
1046    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
1047    case ARM_AM::db: return ARM::t2LDMDB_UPD;
1048    }
1049  case ARM::t2STMIA:
1050  case ARM::t2STMDB:
1051    switch (Mode) {
1052    default: llvm_unreachable("Unhandled submode!");
1053    case ARM_AM::ia: return ARM::t2STMIA_UPD;
1054    case ARM_AM::db: return ARM::t2STMDB_UPD;
1055    }
1056  case ARM::VLDMSIA:
1057    switch (Mode) {
1058    default: llvm_unreachable("Unhandled submode!");
1059    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
1060    case ARM_AM::db: return ARM::VLDMSDB_UPD;
1061    }
1062  case ARM::VLDMDIA:
1063    switch (Mode) {
1064    default: llvm_unreachable("Unhandled submode!");
1065    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
1066    case ARM_AM::db: return ARM::VLDMDDB_UPD;
1067    }
1068  case ARM::VSTMSIA:
1069    switch (Mode) {
1070    default: llvm_unreachable("Unhandled submode!");
1071    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
1072    case ARM_AM::db: return ARM::VSTMSDB_UPD;
1073    }
1074  case ARM::VSTMDIA:
1075    switch (Mode) {
1076    default: llvm_unreachable("Unhandled submode!");
1077    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
1078    case ARM_AM::db: return ARM::VSTMDDB_UPD;
1079    }
1080  }
1081}
1082
1083/// Check if the given instruction increments or decrements a register and
1084/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
1085/// generated by the instruction are possibly read as well.
1086static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
1087                                  ARMCC::CondCodes Pred, unsigned PredReg) {
1088  bool CheckCPSRDef;
1089  int Scale;
1090  switch (MI.getOpcode()) {
1091  case ARM::tADDi8:  Scale =  4; CheckCPSRDef = true; break;
1092  case ARM::tSUBi8:  Scale = -4; CheckCPSRDef = true; break;
1093  case ARM::t2SUBri:
1094  case ARM::SUBri:   Scale = -1; CheckCPSRDef = true; break;
1095  case ARM::t2ADDri:
1096  case ARM::ADDri:   Scale =  1; CheckCPSRDef = true; break;
1097  case ARM::tADDspi: Scale =  4; CheckCPSRDef = false; break;
1098  case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
1099  default: return 0;
1100  }
1101
1102  unsigned MIPredReg;
1103  if (MI.getOperand(0).getReg() != Reg ||
1104      MI.getOperand(1).getReg() != Reg ||
1105      getInstrPredicate(&MI, MIPredReg) != Pred ||
1106      MIPredReg != PredReg)
1107    return 0;
1108
1109  if (CheckCPSRDef && definesCPSR(&MI))
1110    return 0;
1111  return MI.getOperand(2).getImm() * Scale;
1112}
1113
1114/// Searches for an increment or decrement of \p Reg before \p MBBI.
1115static MachineBasicBlock::iterator
1116findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
1117                 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
1118  Offset = 0;
1119  MachineBasicBlock &MBB = *MBBI->getParent();
1120  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
1121  MachineBasicBlock::iterator EndMBBI = MBB.end();
1122  if (MBBI == BeginMBBI)
1123    return EndMBBI;
1124
1125  // Skip debug values.
1126  MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
1127  while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
1128    --PrevMBBI;
1129
1130  Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
1131  return Offset == 0 ? EndMBBI : PrevMBBI;
1132}
1133
1134/// Searches for a increment or decrement of \p Reg after \p MBBI.
1135static MachineBasicBlock::iterator
1136findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
1137                ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
1138  Offset = 0;
1139  MachineBasicBlock &MBB = *MBBI->getParent();
1140  MachineBasicBlock::iterator EndMBBI = MBB.end();
1141  MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
1142  // Skip debug values.
1143  while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
1144    ++NextMBBI;
1145  if (NextMBBI == EndMBBI)
1146    return EndMBBI;
1147
1148  Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
1149  return Offset == 0 ? EndMBBI : NextMBBI;
1150}
1151
/// Fold proceeding/trailing inc/dec of base register into the
/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
  // Thumb1 is already using updating loads/stores.
  if (isThumb1) return false;

  const MachineOperand &BaseOP = MI->getOperand(0);
  unsigned Base = BaseOP.getReg();
  bool BaseKill = BaseOP.isKill();
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == Base)
      return false;

  int Bytes = getLSMultipleTransferSize(MI);
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator MBBI(MI);
  int Offset;
  // First look for a decrement before the memory op that we can absorb by
  // switching to a descending submode...
  MachineBasicBlock::iterator MergeInstr
    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
  ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
  if (Mode == ARM_AM::ia && Offset == -Bytes) {
    Mode = ARM_AM::db;
  } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
    Mode = ARM_AM::da;
  } else {
    // ...otherwise look for a matching increment/decrement after it, which
    // can be folded while keeping the submode unchanged.
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
        ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
      return false;
  }
  // The add/sub is absorbed into the writeback; remove it.
  MBB.erase(MergeInstr);

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}
1217
1218static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
1219                                             ARM_AM::AddrOpc Mode) {
1220  switch (Opc) {
1221  case ARM::LDRi12:
1222    return ARM::LDR_PRE_IMM;
1223  case ARM::STRi12:
1224    return ARM::STR_PRE_IMM;
1225  case ARM::VLDRS:
1226    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
1227  case ARM::VLDRD:
1228    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
1229  case ARM::VSTRS:
1230    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
1231  case ARM::VSTRD:
1232    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
1233  case ARM::t2LDRi8:
1234  case ARM::t2LDRi12:
1235    return ARM::t2LDR_PRE;
1236  case ARM::t2STRi8:
1237  case ARM::t2STRi12:
1238    return ARM::t2STR_PRE;
1239  default: llvm_unreachable("Unhandled opcode!");
1240  }
1241}
1242
1243static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
1244                                              ARM_AM::AddrOpc Mode) {
1245  switch (Opc) {
1246  case ARM::LDRi12:
1247    return ARM::LDR_POST_IMM;
1248  case ARM::STRi12:
1249    return ARM::STR_POST_IMM;
1250  case ARM::VLDRS:
1251    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
1252  case ARM::VLDRD:
1253    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
1254  case ARM::VSTRS:
1255    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
1256  case ARM::VSTRD:
1257    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
1258  case ARM::t2LDRi8:
1259  case ARM::t2LDRi12:
1260    return ARM::t2LDR_POST;
1261  case ARM::t2STRi8:
1262  case ARM::t2STRi12:
1263    return ARM::t2STR_POST;
1264  default: llvm_unreachable("Unhandled opcode!");
1265  }
1266}
1267
/// Fold proceeding/trailing inc/dec of base register into the
/// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
/// an increment/decrement before the memory op becomes a pre-indexed access,
/// one after it becomes a post-indexed access. Returns true and erases both
/// the original memory op and the add/sub on success.
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
  // Thumb1 doesn't have updating LDR/STR.
  // FIXME: Use LDM/STM with single register instead.
  if (isThumb1) return false;

  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
  unsigned Opcode = MI->getOpcode();
  DebugLoc DL = MI->getDebugLoc();
  // isAM5: VFP single-register ops (folded via VLDM/VSTM writeback forms);
  // isAM2: classic ARM LDR/STR immediate forms.
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
  // Only fold when the memory op itself has a zero offset.
  if (isi32Load(Opcode) || isi32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;

  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  int Bytes = getLSMultipleTransferSize(MI);
  MachineBasicBlock &MBB = *MI->getParent();
  MachineBasicBlock::iterator MBBI(MI);
  int Offset;
  // Try pre-indexed first (inc/dec before the memory op)...
  MachineBasicBlock::iterator MergeInstr
    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
  unsigned NewOpc;
  // AM5 (VLDM/VSTM) writeback cannot represent a pre-increment here.
  if (!isAM5 && Offset == Bytes) {
    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
  } else if (Offset == -Bytes) {
    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
  } else {
    // ...then post-indexed (inc/dec after the memory op).
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (Offset == Bytes) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
    } else if (!isAM5 && Offset == -Bytes) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
    } else
      return false;
  }
  // The add/sub is absorbed into the writeback; remove it.
  MBB.erase(MergeInstr);

  ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;

  bool isLd = isLoadSingle(Opcode);
  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2) {
      // LDR_PRE, LDR_POST
      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
      } else {
        // Post-indexed forms take an am2offset operand (reg + encoded imm).
        int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
      }
    } else {
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  } else {
    MachineOperand &MO = MI->getOperand(0);
    // FIXME: post-indexed stores use am2offset_imm, which still encodes
    // the vestigal zero-reg offset register. When that's fixed, this clause
    // can be removed entirely.
    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
      int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
    } else {
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  }
  MBB.erase(MBBI);

  return true;
}
1373
/// Fold a preceding/trailing inc/dec of the base register into a t2LDRDi8 or
/// t2STRDi8, producing the pre-/post-indexed t2LDRD/t2STRD form. Returns true
/// and erases both the original instruction and the add/sub on success.
bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
  unsigned Opcode = MI.getOpcode();
  assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
         "Must have t2STRDi8 or t2LDRDi8");
  // Only fold when the instruction's own immediate offset is zero.
  if (MI.getOperand(3).getImm() != 0)
    return false;

  // Behaviour for writeback is undefined if base register is the same as one
  // of the others.
  const MachineOperand &BaseOp = MI.getOperand(2);
  unsigned Base = BaseOp.getReg();
  const MachineOperand &Reg0Op = MI.getOperand(0);
  const MachineOperand &Reg1Op = MI.getOperand(1);
  if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
    return false;

  unsigned PredReg;
  ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
  MachineBasicBlock::iterator MBBI(MI);
  MachineBasicBlock &MBB = *MI.getParent();
  int Offset;
  // Look for an add/sub of 8 (the doubleword transfer size) before the
  // instruction (pre-indexed), then after it (post-indexed).
  MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
                                                            PredReg, Offset);
  unsigned NewOpc;
  if (Offset == 8 || Offset == -8) {
    NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
  } else {
    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
    if (Offset == 8 || Offset == -8) {
      NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
    } else
      return false;
  }
  // The add/sub is absorbed into the writeback; remove it.
  MBB.erase(MergeInstr);

  DebugLoc DL = MI.getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  // Loads define the writeback base last among the defs; stores define it
  // first. The operand order below must match the instruction definitions.
  if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
    MIB.addOperand(Reg0Op).addOperand(Reg1Op)
       .addReg(BaseOp.getReg(), RegState::Define);
  } else {
    assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
    MIB.addReg(BaseOp.getReg(), RegState::Define)
       .addOperand(Reg0Op).addOperand(Reg1Op);
  }
  MIB.addReg(BaseOp.getReg(), RegState::Kill)
     .addImm(Offset).addImm(Pred).addReg(PredReg);
  assert(TII->get(Opcode).getNumOperands() == 6 &&
         TII->get(NewOpc).getNumOperands() == 7 &&
         "Unexpected number of operands in Opcode specification.");

  // Transfer implicit operands.
  for (const MachineOperand &MO : MI.implicit_operands())
    MIB.addOperand(MO);
  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  MBB.erase(MBBI);
  return true;
}
1433
1434/// Returns true if instruction is a memory operation that this pass is capable
1435/// of operating on.
1436static bool isMemoryOp(const MachineInstr &MI) {
1437  unsigned Opcode = MI.getOpcode();
1438  switch (Opcode) {
1439  case ARM::VLDRS:
1440  case ARM::VSTRS:
1441  case ARM::VLDRD:
1442  case ARM::VSTRD:
1443  case ARM::LDRi12:
1444  case ARM::STRi12:
1445  case ARM::tLDRi:
1446  case ARM::tSTRi:
1447  case ARM::tLDRspi:
1448  case ARM::tSTRspi:
1449  case ARM::t2LDRi8:
1450  case ARM::t2LDRi12:
1451  case ARM::t2STRi8:
1452  case ARM::t2STRi12:
1453    break;
1454  default:
1455    return false;
1456  }
1457  if (!MI.getOperand(1).isReg())
1458    return false;
1459
1460  // When no memory operands are present, conservatively assume unaligned,
1461  // volatile, unfoldable.
1462  if (!MI.hasOneMemOperand())
1463    return false;
1464
1465  const MachineMemOperand &MMO = **MI.memoperands_begin();
1466
1467  // Don't touch volatile memory accesses - we may be changing their order.
1468  if (MMO.isVolatile())
1469    return false;
1470
1471  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
1472  // not.
1473  if (MMO.getAlignment() < 4)
1474    return false;
1475
1476  // str <undef> could probably be eliminated entirely, but for now we just want
1477  // to avoid making a mess of it.
1478  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
1479  if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef())
1480    return false;
1481
1482  // Likewise don't mess with references to undefined addresses.
1483  if (MI.getOperand(1).isUndef())
1484    return false;
1485
1486  return true;
1487}
1488
1489static void InsertLDR_STR(MachineBasicBlock &MBB,
1490                          MachineBasicBlock::iterator &MBBI,
1491                          int Offset, bool isDef,
1492                          DebugLoc DL, unsigned NewOpc,
1493                          unsigned Reg, bool RegDeadKill, bool RegUndef,
1494                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
1495                          bool OffKill, bool OffUndef,
1496                          ARMCC::CondCodes Pred, unsigned PredReg,
1497                          const TargetInstrInfo *TII, bool isT2) {
1498  if (isDef) {
1499    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
1500                                      TII->get(NewOpc))
1501      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
1502      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
1503    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
1504  } else {
1505    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
1506                                      TII->get(NewOpc))
1507      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
1508      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
1509    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
1510  }
1511}
1512
/// Rewrite an LDRD/STRD (or Cortex-M3 errata-affected t2LDRDi8) whose register
/// pair is invalid: either convert it to an LDM/STM or split it into two
/// single loads/stores. Returns true if \p MBBI was replaced (and updates it
/// to the erased instruction's successor via MBB.erase).
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8)
    return false;

  const MachineOperand &BaseOp = MI->getOperand(2);
  unsigned BaseReg = BaseOp.getReg();
  unsigned EvenReg = MI->getOperand(0).getReg();
  unsigned OddReg  = MI->getOperand(1).getReg();
  unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
  unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);

  // ARM errata 602117: LDRD with base in list may result in incorrect base
  // register when interrupted or faulted.
  bool Errata602117 = EvenReg == BaseReg &&
    (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3();
  // ARM LDRD/STRD needs consecutive registers.
  bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) &&
    (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum);

  // Nothing wrong with this pairing; leave the instruction alone.
  if (!Errata602117 && !NonConsecutiveRegs)
    return false;

  bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
  bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
  // For loads the transfer regs are defs (dead flag); for stores they are
  // uses (kill flag).
  bool EvenDeadKill = isLd ?
    MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
  bool EvenUndef = MI->getOperand(0).isUndef();
  bool OddDeadKill  = isLd ?
    MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
  bool OddUndef = MI->getOperand(1).isUndef();
  bool BaseKill = BaseOp.isKill();
  bool BaseUndef = BaseOp.isUndef();
  // t2LDRDi8/t2STRDi8 have no separate offset-register operand.
  bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
  bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
  int OffImm = getMemoryOpOffset(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);

  if (OddRegNum > EvenRegNum && OffImm == 0) {
    // Ascending register numbers and no offset. It's safe to change it to a
    // ldm or stm.
    unsigned NewOpc = (isLd)
      ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
      : (isT2 ? ARM::t2STMIA : ARM::STMIA);
    if (isLd) {
      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
        .addReg(BaseReg, getKillRegState(BaseKill))
        .addImm(Pred).addReg(PredReg)
        .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
        .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
      ++NumLDRD2LDM;
    } else {
      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
        .addReg(BaseReg, getKillRegState(BaseKill))
        .addImm(Pred).addReg(PredReg)
        .addReg(EvenReg,
                getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
        .addReg(OddReg,
                getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
      ++NumSTRD2STM;
    }
  } else {
    // Split into two instructions.
    unsigned NewOpc = (isLd)
      ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
      : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
    // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
    // so adjust and use t2LDRi12 here for that.
    unsigned NewOpc2 = (isLd)
      ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
      : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
    DebugLoc dl = MBBI->getDebugLoc();
    // If this is a load and base register is killed, it may have been
    // re-defed by the load, make sure the first load does not clobber it.
    if (isLd &&
        (BaseKill || OffKill) &&
        (TRI->regsOverlap(EvenReg, BaseReg))) {
      assert(!TRI->regsOverlap(OddReg, BaseReg));
      // Emit the odd half first so the even half (which overlaps the base)
      // is loaded last.
      InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                    OddReg, OddDeadKill, false,
                    BaseReg, false, BaseUndef, false, OffUndef,
                    Pred, PredReg, TII, isT2);
      InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                    EvenReg, EvenDeadKill, false,
                    BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                    Pred, PredReg, TII, isT2);
    } else {
      if (OddReg == EvenReg && EvenDeadKill) {
        // If the two source operands are the same, the kill marker is
        // probably on the first one. e.g.
        // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
        EvenDeadKill = false;
        OddDeadKill = true;
      }
      // Never kill the base register in the first instruction.
      if (EvenReg == BaseReg)
        EvenDeadKill = false;
      InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                    EvenReg, EvenDeadKill, EvenUndef,
                    BaseReg, false, BaseUndef, false, OffUndef,
                    Pred, PredReg, TII, isT2);
      InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                    OddReg, OddDeadKill, OddUndef,
                    BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                    Pred, PredReg, TII, isT2);
    }
    if (isLd)
      ++NumLDRD2LDR;
    else
      ++NumSTRD2STR;
  }

  // Erase the original LDRD/STRD and advance the caller's iterator.
  MBBI = MBB.erase(MBBI);
  return true;
}
1631
/// An optimization pass to turn multiple LDR / STR ops of the same base and
/// incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  MemOpQueue MemOps;
  unsigned CurrBase = 0;                 // Base register of the open chain; 0 = no chain.
  unsigned CurrOpc = ~0u;                // Opcode shared by all ops in the open chain.
  ARMCC::CondCodes CurrPred = ARMCC::AL; // Predicate shared by all ops in the open chain.
  unsigned Position = 0;                 // Per-instruction counter; grows towards the block start.
  assert(Candidates.size() == 0);
  assert(MergeBaseCandidates.size() == 0);
  LiveRegsValid = false;

  // Walk the block from the last instruction towards the first.
  for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
       I = MBBI) {
    // The instruction in front of the iterator is the one we look at.
    MBBI = std::prev(I);
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;
    ++Position;

    if (isMemoryOp(*MBBI)) {
      unsigned Opcode = MBBI->getOpcode();
      const MachineOperand &MO = MBBI->getOperand(0);
      unsigned Reg = MO.getReg();
      unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      if (CurrBase == 0) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrPred = Pred;
        MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
        continue;
      }
      // Note: No need to match PredReg in the next if.
      if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
        // Watch out for:
        //   r4 := ldr [r0, #8]
        //   r4 := ldr [r0, #4]
        // or
        //   r0 := ldr [r0]
        // If a load overrides the base register or a register loaded by
        // another load in our chain, we cannot take this instruction.
        bool Overlap = false;
        if (isLoadSingle(Opcode)) {
          Overlap = (Base == Reg);
          if (!Overlap) {
            for (const MemOpQueueEntry &E : MemOps) {
              if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
                Overlap = true;
                break;
              }
            }
          }
        }

        if (!Overlap) {
          // Check offset and sort memory operation into the current chain.
          // MemOps is kept sorted by increasing offset.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
            continue;
          } else {
            MemOpQueue::iterator MI, ME;
            for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
              if (Offset < MI->Offset) {
                // Found a place to insert.
                break;
              }
              if (Offset == MI->Offset) {
                // Collision, abort.
                MI = ME;
                break;
              }
            }
            if (MI != MemOps.end()) {
              MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
              continue;
            }
          }
        }
      }

      // Don't advance the iterator; The op will start a new chain next.
      MBBI = I;
      --Position;
      // Fallthrough to look into existing chain.
    } else if (MBBI->isDebugValue()) {
      continue;
    } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
               MBBI->getOpcode() == ARM::t2STRDi8) {
      // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
      // remember them because we may still be able to merge add/sub into them.
      MergeBaseCandidates.push_back(MBBI);
    }


    // If we are here then the chain is broken; Extract candidates for a merge.
    if (MemOps.size() > 0) {
      FormCandidates(MemOps);
      // Reset for the next chain.
      CurrBase = 0;
      CurrOpc = ~0u;
      CurrPred = ARMCC::AL;
      MemOps.clear();
    }
  }
  // Flush the chain that was still open when the block start was reached.
  if (MemOps.size() > 0)
    FormCandidates(MemOps);

  // Sort candidates so they get processed from end to begin of the basic
  // block later; This is necessary for liveness calculation.
  auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
    return M0->InsertPos < M1->InsertPos;
  };
  std::sort(Candidates.begin(), Candidates.end(), LessThan);

  // Go through list of candidates and merge.
  bool Changed = false;
  for (const MergeCandidate *Candidate : Candidates) {
    if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
      MachineInstr *Merged = MergeOpsUpdate(*Candidate);
      // Merge preceding/trailing base inc/dec into the merged op.
      if (Merged) {
        Changed = true;
        unsigned Opcode = Merged->getOpcode();
        if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
          MergeBaseUpdateLSDouble(*Merged);
        else
          MergeBaseUpdateLSMultiple(Merged);
      } else {
        // No multiple was formed; still try folding a base update into each
        // individual load / store.
        for (MachineInstr *MI : Candidate->Instrs) {
          if (MergeBaseUpdateLoadStore(MI))
            Changed = true;
        }
      }
    } else {
      assert(Candidate->Instrs.size() == 1);
      if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
        Changed = true;
    }
  }
  Candidates.clear();
  // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
  for (MachineInstr *MI : MergeBaseCandidates)
    MergeBaseUpdateLSDouble(*MI);
  MergeBaseCandidates.clear();

  return Changed;
}
1783
/// If this is an exit BB, try merging the return ops ("bx lr" and "mov pc,
/// lr") into the preceding stack restore so it directly restores the value of
/// LR into pc.
///   ldmfd sp!, {..., lr}
///   bx lr
/// or
///   ldmfd sp!, {..., lr}
///   mov pc, lr
/// =>
///   ldmfd sp!, {..., pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  // Thumb1 LDM doesn't allow high registers.
  if (isThumb1) return false;
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET ||
       MBBI->getOpcode() == ARM::tBX_RET ||
       MBBI->getOpcode() == ARM::MOVPCLR)) {
    MachineBasicBlock::iterator PrevI = std::prev(MBBI);
    // Ignore any DBG_VALUE instructions.
    while (PrevI->isDebugValue() && PrevI != MBB.begin())
      --PrevI;
    MachineInstr *PrevMI = PrevI;
    unsigned Opcode = PrevMI->getOpcode();
    // Only a writeback multiple-load can absorb the return.
    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
      // The last operand is the end of the register list; LR must be the
      // final register popped for the rewrite to be a return.
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
      // Rewrite the LDM into the return form, pop directly into PC, and
      // delete the now-redundant return instruction.
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}
1828
1829bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
1830  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
1831  if (MBBI == MBB.begin() || MBBI == MBB.end() ||
1832      MBBI->getOpcode() != ARM::tBX_RET)
1833    return false;
1834
1835  MachineBasicBlock::iterator Prev = MBBI;
1836  --Prev;
1837  if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR))
1838    return false;
1839
1840  for (auto Use : Prev->uses())
1841    if (Use.isKill()) {
1842      AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
1843          .addReg(Use.getReg(), RegState::Kill))
1844          .copyImplicitOps(&*MBBI);
1845      MBB.erase(MBBI);
1846      MBB.erase(Prev);
1847      return true;
1848    }
1849
1850  llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?");
1851}
1852
1853bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1854  MF = &Fn;
1855  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
1856  TL = STI->getTargetLowering();
1857  AFI = Fn.getInfo<ARMFunctionInfo>();
1858  TII = STI->getInstrInfo();
1859  TRI = STI->getRegisterInfo();
1860
1861  RegClassInfoValid = false;
1862  isThumb2 = AFI->isThumb2Function();
1863  isThumb1 = AFI->isThumbFunction() && !isThumb2;
1864
1865  bool Modified = false;
1866  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1867       ++MFI) {
1868    MachineBasicBlock &MBB = *MFI;
1869    Modified |= LoadStoreMultipleOpti(MBB);
1870    if (STI->hasV5TOps())
1871      Modified |= MergeReturnIntoLDM(MBB);
1872    if (isThumb1)
1873      Modified |= CombineMovBx(MBB);
1874  }
1875
1876  Allocator.DestroyAll();
1877  return Modified;
1878}
1879
// Forward-declare the pass initializer so the constructor below can register
// the pass with the global PassRegistry.
namespace llvm {
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
}

#define ARM_PREALLOC_LOAD_STORE_OPT_NAME                                       \
  "ARM pre- register allocation load / store optimization pass"
1886
namespace {
  /// Pre- register allocation pass that moves loads / stores from consecutive
  /// locations close together to make it more likely they will be combined
  /// later.
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {
      initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry());
    }

    // Per-function state, cached in runOnMachineFunction.
    const DataLayout *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
    }

  private:
    /// Decide whether \p Op0 and \p Op1 can be merged into one LDRD/STRD,
    /// reporting the new opcode and operands via the reference parameters.
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    /// Move the loads / stores in \p Ops (all with base register \p Base)
    /// next to each other when safe and profitable.
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVectorImpl<MachineInstr *> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    /// Scan \p MBB region by region and reschedule its loads and stores.
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt",
                ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
1927
1928bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1929  TD = &Fn.getDataLayout();
1930  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
1931  TII = STI->getInstrInfo();
1932  TRI = STI->getRegisterInfo();
1933  MRI = &Fn.getRegInfo();
1934  MF  = &Fn;
1935
1936  bool Modified = false;
1937  for (MachineBasicBlock &MFI : Fn)
1938    Modified |= RescheduleLoadStoreInstrs(&MFI);
1939
1940  return Modified;
1941}
1942
1943static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
1944                                      MachineBasicBlock::iterator I,
1945                                      MachineBasicBlock::iterator E,
1946                                      SmallPtrSetImpl<MachineInstr*> &MemOps,
1947                                      SmallSet<unsigned, 4> &MemRegs,
1948                                      const TargetRegisterInfo *TRI) {
1949  // Are there stores / loads / calls between them?
1950  // FIXME: This is overly conservative. We should make use of alias information
1951  // some day.
1952  SmallSet<unsigned, 4> AddedRegPressure;
1953  while (++I != E) {
1954    if (I->isDebugValue() || MemOps.count(&*I))
1955      continue;
1956    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
1957      return false;
1958    if (isLd && I->mayStore())
1959      return false;
1960    if (!isLd) {
1961      if (I->mayLoad())
1962        return false;
1963      // It's not safe to move the first 'str' down.
1964      // str r1, [r0]
1965      // strh r5, [r0]
1966      // str r4, [r0, #+4]
1967      if (I->mayStore())
1968        return false;
1969    }
1970    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
1971      MachineOperand &MO = I->getOperand(j);
1972      if (!MO.isReg())
1973        continue;
1974      unsigned Reg = MO.getReg();
1975      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
1976        return false;
1977      if (Reg != Base && !MemRegs.count(Reg))
1978        AddedRegPressure.insert(Reg);
1979    }
1980  }
1981
1982  // Estimate register pressure increase due to the transformation.
1983  if (MemRegs.size() <= 4)
1984    // Ok if we are moving small number of instructions.
1985    return true;
1986  return AddedRegPressure.size() <= MemRegs.size() * 2;
1987}
1988
1989
1990/// Copy \p Op0 and \p Op1 operands into a new array assigned to MI.
1991static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
1992                                   MachineInstr *Op1) {
1993  assert(MI->memoperands_empty() && "expected a new machineinstr");
1994  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
1995    + (Op1->memoperands_end() - Op1->memoperands_begin());
1996
1997  MachineFunction *MF = MI->getParent()->getParent();
1998  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
1999  MachineSDNode::mmo_iterator MemEnd =
2000    std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
2001  MemEnd =
2002    std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
2003  MI->setMemRefs(MemBegin, MemEnd);
2004}
2005
2006bool
2007ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
2008                                          DebugLoc &dl, unsigned &NewOpc,
2009                                          unsigned &FirstReg,
2010                                          unsigned &SecondReg,
2011                                          unsigned &BaseReg, int &Offset,
2012                                          unsigned &PredReg,
2013                                          ARMCC::CondCodes &Pred,
2014                                          bool &isT2) {
2015  // Make sure we're allowed to generate LDRD/STRD.
2016  if (!STI->hasV5TEOps())
2017    return false;
2018
2019  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
2020  unsigned Scale = 1;
2021  unsigned Opcode = Op0->getOpcode();
2022  if (Opcode == ARM::LDRi12) {
2023    NewOpc = ARM::LDRD;
2024  } else if (Opcode == ARM::STRi12) {
2025    NewOpc = ARM::STRD;
2026  } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
2027    NewOpc = ARM::t2LDRDi8;
2028    Scale = 4;
2029    isT2 = true;
2030  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
2031    NewOpc = ARM::t2STRDi8;
2032    Scale = 4;
2033    isT2 = true;
2034  } else {
2035    return false;
2036  }
2037
2038  // Make sure the base address satisfies i64 ld / st alignment requirement.
2039  // At the moment, we ignore the memoryoperand's value.
2040  // If we want to use AliasAnalysis, we should check it accordingly.
2041  if (!Op0->hasOneMemOperand() ||
2042      (*Op0->memoperands_begin())->isVolatile())
2043    return false;
2044
2045  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
2046  const Function *Func = MF->getFunction();
2047  unsigned ReqAlign = STI->hasV6Ops()
2048    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
2049    : 8;  // Pre-v6 need 8-byte align
2050  if (Align < ReqAlign)
2051    return false;
2052
2053  // Then make sure the immediate offset fits.
2054  int OffImm = getMemoryOpOffset(Op0);
2055  if (isT2) {
2056    int Limit = (1 << 8) * Scale;
2057    if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
2058      return false;
2059    Offset = OffImm;
2060  } else {
2061    ARM_AM::AddrOpc AddSub = ARM_AM::add;
2062    if (OffImm < 0) {
2063      AddSub = ARM_AM::sub;
2064      OffImm = - OffImm;
2065    }
2066    int Limit = (1 << 8) * Scale;
2067    if (OffImm >= Limit || (OffImm & (Scale-1)))
2068      return false;
2069    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
2070  }
2071  FirstReg = Op0->getOperand(0).getReg();
2072  SecondReg = Op1->getOperand(0).getReg();
2073  if (FirstReg == SecondReg)
2074    return false;
2075  BaseReg = Op0->getOperand(1).getReg();
2076  Pred = getInstrPredicate(Op0, PredReg);
2077  dl = Op0->getDebugLoc();
2078  return true;
2079}
2080
/// Move the loads / stores in \p Ops (all using base register \p Base) next
/// to each other when it is safe and profitable, forming LDRD/STRD pairs
/// where possible. Returns true if any instruction was moved or rewritten.
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVectorImpl<MachineInstr *> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(),
            [](const MachineInstr *LHS, const MachineInstr *RHS) {
    int LOffset = getMemoryOpOffset(LHS);
    int ROffset = getMemoryOpOffset(RHS);
    assert(LHS == RHS || LOffset != ROffset);
    return LOffset > ROffset;
  });

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;          // Smallest program location in the group.
    unsigned LastLoc = 0;             // Largest program location in the group.
    MachineInstr *FirstOp = nullptr;  // Op at FirstLoc.
    MachineInstr *LastOp = nullptr;   // Op at LastLoc.
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    // Walk Ops back-to-front (i.e. in increasing offset order) and grow the
    // group of same-opcode, gap-free ops that could be moved together.
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      // All ops in a group must map to the same LDM/STM opcode.
      unsigned LSMOpcode
        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
      if (LastOpcode && LSMOpcode != LastOpcode)
        break;

      // Offsets must be consecutive: same width, no gaps.
      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = LSMOpcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      // Collect the group's instructions and transfer registers for the
      // safety / profitability scan below.
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        // Give up on this group; drop its ops from further consideration.
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end()
               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned FirstReg = 0, SecondReg = 0;
        unsigned BaseReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             FirstReg, SecondReg, BaseReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          const MCInstrDesc &MCID = TII->get(NewOpc);
          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
          MRI->constrainRegClass(FirstReg, TRC);
          MRI->constrainRegClass(SecondReg, TRC);

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(FirstReg, RegState::Define)
              .addReg(SecondReg, RegState::Define)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always by reg0 since we're transforming LDRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            concatenateMemOperands(MIB, Op0, Op1);
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(FirstReg)
              .addReg(SecondReg)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always by reg0 since we're transforming STRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            concatenateMemOperands(MIB, Op0, Op1);
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumSTRDFormed;
          }
          // The pair replaces both originals.
          MBB->erase(Op0);
          MBB->erase(Op1);

          if (!isT2) {
            // Add register allocation hints to form register pairs.
            MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
            MRI->setRegAllocationHint(SecondReg,  ARMRI::RegPairOdd, FirstReg);
          }
        } else {
          // No pair formed: just move the group's ops to the insert point.
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}
2241
/// Scan \p MBB region by region (regions end at calls / terminators) and
/// reschedule loads and stores that share a base register so they end up
/// adjacent, making later LDM/STM formation more likely.
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  // Program-order location of each instruction; used by RescheduleOps to
  // bound how far apart grouped ops are.
  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  // Base register -> loads / stores seen so far in the current region.
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  // Base registers in the order they were first seen.
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    // Collect loads / stores per base register up to the next barrier.
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      if (MI->isCall() || MI->isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      if (!MI->isDebugValue())
        MI2LocMap[MI] = ++Loc;

      // Only unpredicated single loads / stores participate.
      if (!isMemoryOp(*MI))
        continue;
      unsigned PredReg = 0;
      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isLoadSingle(Opc);
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          // A second access to the same base+offset ends the region (see
          // StopHere handling below).
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2LdsMap[Base].push_back(MI);
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2StsMap[Base].push_back(MI);
          StBases.push_back(MI->getOperand(1).getReg());
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    // More instructions remain: reset the per-region maps and continue
    // after the barrier.
    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}
2347
2348
2349/// Returns an instance of the load / store optimization pass.
2350FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
2351  if (PreAlloc)
2352    return new ARMPreAllocLoadStoreOpt();
2353  return new ARMLoadStoreOpt();
2354}
2355
2356