ARMLoadStoreOptimizer.cpp revision b43a20e3bce6b4b16151ec25ef3541494fce8425
1//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file contains a pass that performs load / store related peephole
11// optimizations. This pass should be run after register allocation.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "arm-ldst-opt"
16#include "ARM.h"
17#include "ARMAddressingModes.h"
18#include "ARMBaseInstrInfo.h"
19#include "ARMMachineFunctionInfo.h"
20#include "ARMRegisterInfo.h"
21#include "llvm/DerivedTypes.h"
22#include "llvm/Function.h"
23#include "llvm/CodeGen/MachineBasicBlock.h"
24#include "llvm/CodeGen/MachineFunctionPass.h"
25#include "llvm/CodeGen/MachineInstr.h"
26#include "llvm/CodeGen/MachineInstrBuilder.h"
27#include "llvm/CodeGen/MachineRegisterInfo.h"
28#include "llvm/CodeGen/RegisterScavenging.h"
29#include "llvm/Target/TargetData.h"
30#include "llvm/Target/TargetInstrInfo.h"
31#include "llvm/Target/TargetMachine.h"
32#include "llvm/Target/TargetRegisterInfo.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ErrorHandling.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/STLExtras.h"
37#include "llvm/ADT/SmallPtrSet.h"
38#include "llvm/ADT/SmallSet.h"
39#include "llvm/ADT/SmallVector.h"
40#include "llvm/ADT/Statistic.h"
41using namespace llvm;
42
// Statistics collected by this pass (reported with -stats).
STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumFLDMGened, "Number of fldm instructions generated");
STATISTIC(NumFSTMGened, "Number of fstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
54
/// ARMLoadStoreOpt - Post-register-allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
57
58namespace {
59  struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass {
60    static char ID;
61    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}
62
63    const TargetInstrInfo *TII;
64    const TargetRegisterInfo *TRI;
65    ARMFunctionInfo *AFI;
66    RegScavenger *RS;
67    bool isThumb2;
68
69    virtual bool runOnMachineFunction(MachineFunction &Fn);
70
71    virtual const char *getPassName() const {
72      return "ARM load / store optimization pass";
73    }
74
75  private:
76    struct MemOpQueueEntry {
77      int Offset;
78      unsigned Position;
79      MachineBasicBlock::iterator MBBI;
80      bool Merged;
81      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
82        : Offset(o), Position(p), MBBI(i), Merged(false) {};
83    };
84    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
85    typedef MemOpQueue::iterator MemOpQueueIter;
86
87    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
88                  int Offset, unsigned Base, bool BaseKill, int Opcode,
89                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
90                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
91    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
92                      int Opcode, unsigned Size,
93                      ARMCC::CondCodes Pred, unsigned PredReg,
94                      unsigned Scratch, MemOpQueue &MemOps,
95                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);
96
97    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
98    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
99                             MachineBasicBlock::iterator &MBBI);
100    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
101                                  MachineBasicBlock::iterator MBBI,
102                                  const TargetInstrInfo *TII,
103                                  bool &Advance,
104                                  MachineBasicBlock::iterator &I);
105    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
106                                   MachineBasicBlock::iterator MBBI,
107                                   bool &Advance,
108                                   MachineBasicBlock::iterator &I);
109    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
110    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
111  };
112  char ARMLoadStoreOpt::ID = 0;
113}
114
115static int getLoadStoreMultipleOpcode(int Opcode) {
116  switch (Opcode) {
117  case ARM::LDR:
118    NumLDMGened++;
119    return ARM::LDM;
120  case ARM::STR:
121    NumSTMGened++;
122    return ARM::STM;
123  case ARM::t2LDRi8:
124  case ARM::t2LDRi12:
125    NumLDMGened++;
126    return ARM::t2LDM;
127  case ARM::t2STRi8:
128  case ARM::t2STRi12:
129    NumSTMGened++;
130    return ARM::t2STM;
131  case ARM::FLDS:
132    NumFLDMGened++;
133    return ARM::FLDMS;
134  case ARM::FSTS:
135    NumFSTMGened++;
136    return ARM::FSTMS;
137  case ARM::FLDD:
138    NumFLDMGened++;
139    return ARM::FLDMD;
140  case ARM::FSTD:
141    NumFSTMGened++;
142    return ARM::FSTMD;
143  default: llvm_unreachable("Unhandled opcode!");
144  }
145  return 0;
146}
147
148static bool isT2i32Load(unsigned Opc) {
149  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
150}
151
152static bool isi32Load(unsigned Opc) {
153  return Opc == ARM::LDR || isT2i32Load(Opc);
154}
155
156static bool isT2i32Store(unsigned Opc) {
157  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
158}
159
160static bool isi32Store(unsigned Opc) {
161  return Opc == ARM::STR || isT2i32Store(Opc);
162}
163
/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  // Pick the addressing sub-mode from the starting offset.  For the integer
  // (AM4) forms: +4 => increment-before, -4*(n-1) => decrement-after,
  // -4*n => decrement-before; 0 => increment-after (the default).
  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  if (isAM4 && Offset == 4) {
    if (isThumb2)
      // Thumb2 does not support ldmib / stmib.
      return false;
    Mode = ARM_AM::ib;
  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
    if (isThumb2)
      // Thumb2 does not support ldmda / stmda.
      return false;
    Mode = ARM_AM::da;
  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination register to
      // use as the new base.  It is about to be overwritten anyway.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Use the scratch register to use as a new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    // add / sub of the base, encodable as an immediate operand; Thumb2 uses
    // the SP-relative forms when the base is SP.
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false;  // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  // Build the multiple op: base, mode immediate, predicate, then a dummy
  // writeback operand followed by the register list.
  bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
  bool isDef = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
  MIB.addReg(0); // Add optional writeback (0 for now).
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}
251
/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.  MemOps[SIndex..] is a queue of
/// candidates sharing the same base register, sorted by offset; Merges
/// receives an iterator to each multiple op created.  Recurses when a
/// candidate breaks the run so later entries still get a chance to merge.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          unsigned Base, int Opcode, unsigned Size,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;   // Starting offset of the run being accumulated.
  unsigned Pos = MemOps[SIndex].Position;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  unsigned PReg = Loc->getOperand(0).getReg();
  unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);
  bool isKill = Loc->getOperand(0).isKill();

  SmallVector<std::pair<unsigned,bool>, 8> Regs;
  Regs.push_back(std::make_pair(PReg, isKill));
  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
    isKill = MemOps[i].MBBI->getOperand(0).isKill();
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      // Extends the current run: contiguous offset and valid register order.
      Offset += Size;
      Regs.push_back(std::make_pair(Reg, isKill));
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try merge the earlier ones first.
      if (MergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
                   Scratch, dl, Regs)) {
        Merges.push_back(prior(Loc));
        for (unsigned j = SIndex; j < i; ++j) {
          MBB.erase(MemOps[j].MBBI);
          MemOps[j].Merged = true;
        }
      }
      // Restart the scan at the instruction that broke the run.
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    // Track the latest (by position) instruction in the run; the multiple op
    // is inserted after it so all merged values are available.
    if (MemOps[i].Position > Pos) {
      Pos = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  // All remaining entries fit in one run; merge them after the last one.
  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  if (MergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
               Scratch, dl, Regs)) {
    Merges.push_back(prior(Loc));
    for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) {
      MBB.erase(MemOps[i].MBBI);
      MemOps[i].Merged = true;
    }
  }

  return;
}
317
318static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
319                                       unsigned Bytes, unsigned Limit,
320                                       ARMCC::CondCodes Pred, unsigned PredReg){
321  unsigned MyPredReg = 0;
322  if (!MI)
323    return false;
324  if (MI->getOpcode() != ARM::t2SUBri &&
325      MI->getOpcode() != ARM::t2SUBrSPi &&
326      MI->getOpcode() != ARM::t2SUBrSPi12 &&
327      MI->getOpcode() != ARM::tSUBspi &&
328      MI->getOpcode() != ARM::SUBri)
329    return false;
330
331  // Make sure the offset fits in 8 bits.
332  if (Bytes <= 0 || (Limit && Bytes >= Limit))
333    return false;
334
335  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
336  return (MI->getOperand(0).getReg() == Base &&
337          MI->getOperand(1).getReg() == Base &&
338          (MI->getOperand(2).getImm()*Scale) == Bytes &&
339          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
340          MyPredReg == PredReg);
341}
342
343static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
344                                       unsigned Bytes, unsigned Limit,
345                                       ARMCC::CondCodes Pred, unsigned PredReg){
346  unsigned MyPredReg = 0;
347  if (!MI)
348    return false;
349  if (MI->getOpcode() != ARM::t2ADDri &&
350      MI->getOpcode() != ARM::t2ADDrSPi &&
351      MI->getOpcode() != ARM::t2ADDrSPi12 &&
352      MI->getOpcode() != ARM::tADDspi &&
353      MI->getOpcode() != ARM::ADDri)
354    return false;
355
356  if (Bytes <= 0 || (Limit && Bytes >= Limit))
357    // Make sure the offset fits in 8 bits.
358    return false;
359
360  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
361  return (MI->getOperand(0).getReg() == Base &&
362          MI->getOperand(1).getReg() == Base &&
363          (MI->getOperand(2).getImm()*Scale) == Bytes &&
364          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
365          MyPredReg == PredReg);
366}
367
368static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
369  switch (MI->getOpcode()) {
370  default: return 0;
371  case ARM::LDR:
372  case ARM::STR:
373  case ARM::t2LDRi8:
374  case ARM::t2LDRi12:
375  case ARM::t2STRi8:
376  case ARM::t2STRi12:
377  case ARM::FLDS:
378  case ARM::FSTS:
379    return 4;
380  case ARM::FLDD:
381  case ARM::FSTD:
382    return 8;
383  case ARM::LDM:
384  case ARM::STM:
385  case ARM::t2LDM:
386  case ARM::t2STM:
387    return (MI->getNumOperands() - 5) * 4;
388  case ARM::FLDMS:
389  case ARM::FSTMS:
390  case ARM::FLDMD:
391  case ARM::FSTMD:
392    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
393  }
394}
395
396/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
397/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
398///
399/// stmia rn, <ra, rb, rc>
400/// rn := rn + 4 * 3;
401/// =>
402/// stmia rn!, <ra, rb, rc>
403///
404/// rn := rn - 4 * 3;
405/// ldmia rn, <ra, rb, rc>
406/// =>
407/// ldmdb rn!, <ra, rb, rc>
408bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
409                                               MachineBasicBlock::iterator MBBI,
410                                               bool &Advance,
411                                               MachineBasicBlock::iterator &I) {
412  MachineInstr *MI = MBBI;
413  unsigned Base = MI->getOperand(0).getReg();
414  unsigned Bytes = getLSMultipleTransferSize(MI);
415  unsigned PredReg = 0;
416  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
417  int Opcode = MI->getOpcode();
418  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
419    Opcode == ARM::STM || Opcode == ARM::t2STM;
420
421  if (isAM4) {
422    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
423      return false;
424
425    // Can't use the updating AM4 sub-mode if the base register is also a dest
426    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
427    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
428      if (MI->getOperand(i).getReg() == Base)
429        return false;
430    }
431
432    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
433    if (MBBI != MBB.begin()) {
434      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
435      if (Mode == ARM_AM::ia &&
436          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
437        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
438        MI->getOperand(4).setReg(Base);
439        MI->getOperand(4).setIsDef();
440        MBB.erase(PrevMBBI);
441        return true;
442      } else if (Mode == ARM_AM::ib &&
443                 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
444        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
445        MI->getOperand(4).setReg(Base);  // WB to base
446        MI->getOperand(4).setIsDef();
447        MBB.erase(PrevMBBI);
448        return true;
449      }
450    }
451
452    if (MBBI != MBB.end()) {
453      MachineBasicBlock::iterator NextMBBI = next(MBBI);
454      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
455          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
456        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
457        MI->getOperand(4).setReg(Base);  // WB to base
458        MI->getOperand(4).setIsDef();
459        if (NextMBBI == I) {
460          Advance = true;
461          ++I;
462        }
463        MBB.erase(NextMBBI);
464        return true;
465      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
466                 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
467        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
468        MI->getOperand(4).setReg(Base);  // WB to base
469        MI->getOperand(4).setIsDef();
470        if (NextMBBI == I) {
471          Advance = true;
472          ++I;
473        }
474        MBB.erase(NextMBBI);
475        return true;
476      }
477    }
478  } else {
479    // FLDM{D|S}, FSTM{D|S} addressing mode 5 ops.
480    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
481      return false;
482
483    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
484    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
485    if (MBBI != MBB.begin()) {
486      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
487      if (Mode == ARM_AM::ia &&
488          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
489        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
490        MI->getOperand(4).setReg(Base);  // WB to base
491        MI->getOperand(4).setIsDef();
492        MBB.erase(PrevMBBI);
493        return true;
494      }
495    }
496
497    if (MBBI != MBB.end()) {
498      MachineBasicBlock::iterator NextMBBI = next(MBBI);
499      if (Mode == ARM_AM::ia &&
500          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
501        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
502        MI->getOperand(4).setReg(Base);  // WB to base
503        MI->getOperand(4).setIsDef();
504        if (NextMBBI == I) {
505          Advance = true;
506          ++I;
507        }
508        MBB.erase(NextMBBI);
509      }
510      return true;
511    }
512  }
513
514  return false;
515}
516
517static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
518  switch (Opc) {
519  case ARM::LDR: return ARM::LDR_PRE;
520  case ARM::STR: return ARM::STR_PRE;
521  case ARM::FLDS: return ARM::FLDMS;
522  case ARM::FLDD: return ARM::FLDMD;
523  case ARM::FSTS: return ARM::FSTMS;
524  case ARM::FSTD: return ARM::FSTMD;
525  case ARM::t2LDRi8:
526  case ARM::t2LDRi12:
527    return ARM::t2LDR_PRE;
528  case ARM::t2STRi8:
529  case ARM::t2STRi12:
530    return ARM::t2STR_PRE;
531  default: llvm_unreachable("Unhandled opcode!");
532  }
533  return 0;
534}
535
536static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
537  switch (Opc) {
538  case ARM::LDR: return ARM::LDR_POST;
539  case ARM::STR: return ARM::STR_POST;
540  case ARM::FLDS: return ARM::FLDMS;
541  case ARM::FLDD: return ARM::FLDMD;
542  case ARM::FSTS: return ARM::FSTMS;
543  case ARM::FSTD: return ARM::FSTMD;
544  case ARM::t2LDRi8:
545  case ARM::t2LDRi12:
546    return ARM::t2LDR_POST;
547  case ARM::t2STRi8:
548  case ARM::t2STRi12:
549    return ARM::t2STR_POST;
550  default: llvm_unreachable("Unhandled opcode!");
551  }
552  return 0;
553}
554
/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible, rewriting
/// the instruction to a pre- or post-indexed form.  If the erased add/sub is
/// the instruction 'I' points at, 'I' is advanced and 'Advance' is set so
/// the caller does not revisit a dead iterator.
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::FLDD || Opcode == ARM::FLDS ||
    Opcode == ARM::FSTD || Opcode == ARM::FSTS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  // Only zero-offset forms can absorb a base update.
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  // Try folding a preceding add/sub first (pre-indexed form)...
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // ...then a trailing add/sub (post-indexed form).
  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  // Encode the offset in the form the new opcode expects.
  bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
  unsigned Offset = 0;
  if (isAM5)
    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
                               ? ARM_AM::db
                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
  if (isLd) {
    if (isAM5)
      // FLDMS, FLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST,
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // FSTMS, FSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  // Remove the original (non-indexed) load / store.
  MBB.erase(MBBI);

  return true;
}
681
682/// isMemoryOp - Returns true if instruction is a memory operations (that this
683/// pass is capable of operating on).
684static bool isMemoryOp(const MachineInstr *MI) {
685  int Opcode = MI->getOpcode();
686  switch (Opcode) {
687  default: break;
688  case ARM::LDR:
689  case ARM::STR:
690    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
691  case ARM::FLDS:
692  case ARM::FSTS:
693    return MI->getOperand(1).isReg();
694  case ARM::FLDD:
695  case ARM::FSTD:
696    return MI->getOperand(1).isReg();
697  case ARM::t2LDRi8:
698  case ARM::t2LDRi12:
699  case ARM::t2STRi8:
700  case ARM::t2STRi12:
701    return MI->getOperand(1).isReg();
702  }
703  return false;
704}
705
706/// AdvanceRS - Advance register scavenger to just before the earliest memory
707/// op that is being merged.
708void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
709  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
710  unsigned Position = MemOps[0].Position;
711  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
712    if (MemOps[i].Position < Position) {
713      Position = MemOps[i].Position;
714      Loc = MemOps[i].MBBI;
715    }
716  }
717
718  if (Loc != MBB.begin())
719    RS->forward(prior(Loc));
720}
721
722static int getMemoryOpOffset(const MachineInstr *MI) {
723  int Opcode = MI->getOpcode();
724  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
725  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
726  unsigned NumOperands = MI->getDesc().getNumOperands();
727  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
728
729  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
730      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
731      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
732    return OffField;
733
734  int Offset = isAM2
735    ? ARM_AM::getAM2Offset(OffField)
736    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
737             : ARM_AM::getAM5Offset(OffField) * 4);
738  if (isAM2) {
739    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
740      Offset = -Offset;
741  } else if (isAM3) {
742    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
743      Offset = -Offset;
744  } else {
745    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
746      Offset = -Offset;
747  }
748  return Offset;
749}
750
751static void InsertLDR_STR(MachineBasicBlock &MBB,
752                          MachineBasicBlock::iterator &MBBI,
753                          int OffImm, bool isDef,
754                          DebugLoc dl, unsigned NewOpc,
755                          unsigned Reg, bool RegDeadKill, bool RegUndef,
756                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
757                          unsigned OffReg, bool OffKill, bool OffUndef,
758                          ARMCC::CondCodes Pred, unsigned PredReg,
759                          const TargetInstrInfo *TII, bool isT2) {
760  int Offset = OffImm;
761  if (!isT2) {
762    if (OffImm < 0)
763      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
764    else
765      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
766  }
767  if (isDef) {
768    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
769                                      TII->get(NewOpc))
770      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
771      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
772    if (!isT2)
773      MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
774    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
775  } else {
776    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
777                                      TII->get(NewOpc))
778      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
779      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
780    if (!isT2)
781      MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
782    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
783  }
784}
785
/// FixInvalidRegPairOp - LDRD / STRD require an even / odd register pair with
/// ascending numbering (e.g. r0/r1); the check is done on Dwarf register
/// numbers. If register allocation produced an illegal pair, rewrite the
/// instruction: as a two-register LDM / STM when the registers ascend and
/// there is no offset, otherwise as two single-word loads / stores. When a
/// rewrite happens, MBBI is repositioned to the last newly inserted
/// instruction before the original is erased. Always returns false.
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg  = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
    // An even register followed by the next consecutive one is already a
    // legal pair; nothing to fix.
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    // For loads operands 0/1 are defs (track dead); for stores they are uses
    // (track kill).
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill  = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    // Thumb2 LDRD / STRD have no register-offset operand.
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      // For Thumb2 a negative offset forces the imm8 form; otherwise imm12.
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and base register is killed, it may have been
      // re-defed by the load, make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        // The odd-numbered destination must not clobber base/offset either,
        // or there would be no safe ordering at all.
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        // Emit the high-half load first so EvenReg's redefinition of the
        // base/offset register happens last.
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        // Normal order: low word first, kill flags on the second access.
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    // Point MBBI at the last instruction inserted above, then remove the
    // original LDRD / STRD.
    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  return false;
}
890
/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
/// Returns true if any instructions were merged.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;          // Number of ops in the current chain.
  MemOpQueue MemOps;               // Current chain of mergeable memory ops.
  unsigned CurrBase = 0;           // Base register of the chain (0 = none).
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;           // Ordinal position of MBBI in the block.
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    // Expand illegal LDRD / STRD register pairs first.
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;
    bool TryMerge = false;
    bool Clobber  = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            // Keep the queue sorted by offset: insert before the first
            // entry with a larger offset.
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's needed.
        // First advance to the instruction just before the start of the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        // Count the LDM / STM instructions formed above as merges too.
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      // Reset chain state so the next memory op starts a fresh chain.
      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}
1040
1041namespace {
1042  struct OffsetCompare {
1043    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
1044      int LOffset = getMemoryOpOffset(LHS);
1045      int ROffset = getMemoryOpOffset(RHS);
1046      assert(LHS == RHS || LOffset != ROffset);
1047      return LOffset > ROffset;
1048    }
1049  };
1050}
1051
1052/// MergeReturnIntoLDM - If this is a exit BB, try merging the return op
1053/// (bx lr) into the preceeding stack restore so it directly restore the value
1054/// of LR into pc.
1055///   ldmfd sp!, {r7, lr}
1056///   bx lr
1057/// =>
1058///   ldmfd sp!, {r7, pc}
1059bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
1060  if (MBB.empty()) return false;
1061
1062  MachineBasicBlock::iterator MBBI = prior(MBB.end());
1063  if (MBBI != MBB.begin() &&
1064      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
1065    MachineInstr *PrevMI = prior(MBBI);
1066    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
1067      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
1068      if (MO.getReg() != ARM::LR)
1069        return false;
1070      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
1071      PrevMI->setDesc(TII->get(NewOpc));
1072      MO.setReg(ARM::PC);
1073      MBB.erase(MBBI);
1074      return true;
1075    }
1076  }
1077  return false;
1078}
1079
1080bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1081  const TargetMachine &TM = Fn.getTarget();
1082  AFI = Fn.getInfo<ARMFunctionInfo>();
1083  TII = TM.getInstrInfo();
1084  TRI = TM.getRegisterInfo();
1085  RS = new RegScavenger();
1086  isThumb2 = AFI->isThumb2Function();
1087
1088  bool Modified = false;
1089  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1090       ++MFI) {
1091    MachineBasicBlock &MBB = *MFI;
1092    Modified |= LoadStoreMultipleOpti(MBB);
1093    Modified |= MergeReturnIntoLDM(MBB);
1094  }
1095
1096  delete RS;
1097  return Modified;
1098}
1099
1100
/// ARMPreAllocLoadStoreOpt - Pre-register-allocation pass that moves
/// loads / stores from consecutive memory locations closer together to make
/// it more likely they will be combined later.
1104
namespace {
  /// ARMPreAllocLoadStoreOpt - Reschedules loads / stores that share a base
  /// register so consecutive accesses sit next to each other, and forms
  /// LDRD / STRD pairs where profitable.
  struct VISIBILITY_HIDDEN ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}

    // Cached target hooks, initialized in runOnMachineFunction.
    const TargetData *TD;           // Data layout (alignment queries).
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;       // Used for register-pair allocation hints.
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    // CanFormLdStDWord - Check whether Op0 / Op1 can be combined into one
    // LDRD / STRD; on success fills in the outgoing operand parameters.
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    // RescheduleOps - Move the same-base loads / stores in Ops next to each
    // other when it is safe and profitable to do so.
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    // RescheduleLoadStoreInstrs - Per-block driver that groups memory ops by
    // base register and calls RescheduleOps on each group.
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}
1138
1139bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1140  TD  = Fn.getTarget().getTargetData();
1141  TII = Fn.getTarget().getInstrInfo();
1142  TRI = Fn.getTarget().getRegisterInfo();
1143  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
1144  MRI = &Fn.getRegInfo();
1145  MF  = &Fn;
1146
1147  bool Modified = false;
1148  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1149       ++MFI)
1150    Modified |= RescheduleLoadStoreInstrs(MFI);
1151
1152  return Modified;
1153}
1154
1155static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
1156                                      MachineBasicBlock::iterator I,
1157                                      MachineBasicBlock::iterator E,
1158                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
1159                                      SmallSet<unsigned, 4> &MemRegs,
1160                                      const TargetRegisterInfo *TRI) {
1161  // Are there stores / loads / calls between them?
1162  // FIXME: This is overly conservative. We should make use of alias information
1163  // some day.
1164  SmallSet<unsigned, 4> AddedRegPressure;
1165  while (++I != E) {
1166    if (MemOps.count(&*I))
1167      continue;
1168    const TargetInstrDesc &TID = I->getDesc();
1169    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
1170      return false;
1171    if (isLd && TID.mayStore())
1172      return false;
1173    if (!isLd) {
1174      if (TID.mayLoad())
1175        return false;
1176      // It's not safe to move the first 'str' down.
1177      // str r1, [r0]
1178      // strh r5, [r0]
1179      // str r4, [r0, #+4]
1180      if (TID.mayStore())
1181        return false;
1182    }
1183    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
1184      MachineOperand &MO = I->getOperand(j);
1185      if (!MO.isReg())
1186        continue;
1187      unsigned Reg = MO.getReg();
1188      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
1189        return false;
1190      if (Reg != Base && !MemRegs.count(Reg))
1191        AddedRegPressure.insert(Reg);
1192    }
1193  }
1194
1195  // Estimate register pressure increase due to the transformation.
1196  if (MemRegs.size() <= 4)
1197    // Ok if we are moving small number of instructions.
1198    return true;
1199  return AddedRegPressure.size() <= MemRegs.size() * 2;
1200}
1201
/// CanFormLdStDWord - Return true if the two loads / stores Op0 and Op1
/// (Op0 at the lower address) can be combined into a single LDRD / STRD.
/// On success the reference parameters are filled in with the operands of
/// the combined instruction.
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: FLDS / FSTS -> FLDD / FSTD
  // Scale of the immediate offset field: Thumb2 LDRD/STRD encode the offset
  // as imm8 scaled by 4; ARM mode uses an unscaled addrmode3 offset.
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the offset registers match.
  if (!isT2 &&
      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
      return false;

  // Make sure the base address satisfies i64 ld / st alignment requirement.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 need 8-byte align
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      // NOTE(review): this bounds negative offsets to the t2LDRi8 range so
      // FixInvalidRegPairOp can still split the pair later if it has to.
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    Offset = OffImm;
  } else {
    // ARM mode: encode |offset| plus add/sub flag in addrmode3 form.
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  // LDRD/STRD need two distinct transfer registers.
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  if (!isT2)
    OffReg = Op0->getOperand(2).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}
1287
/// RescheduleOps - Given the loads / stores in Ops, which all use the same
/// base register, find runs of same-opcode, gap-free, consecutive-offset ops
/// and move them next to each other (forming an LDRD / STRD for a suitable
/// pair).  MI2LocMap gives each instruction's ordinal position in the block.
/// Returns true if anything was moved.
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    // Walk Ops from the back (lowest offset first, since the sort above is
    // descending) collecting a mergeable run.
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      // The run must be homogeneous in opcode...
      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      // ...and contiguous in memory: each op exactly LastBytes past the
      // previous one.
      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      // Nothing mergeable ends at the lowest offset; drop it and retry.
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        // Discard the whole run.
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        // Loads are moved up to the first of the run, stores down to the
        // last, skipping over other members of the run.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            // Thumb2 LDRD has no register-offset operand.
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          // No pair formed: just move the run's instructions to InsertPos.
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}
1425
/// RescheduleLoadStoreInstrs - Scan the block in regions bounded by calls /
/// terminators, bucket the unpredicated loads and stores by base register,
/// and hand each bucket with more than one op to RescheduleOps.  Returns
/// true if anything was rescheduled.
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;   // Instr -> ordinal position.
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;   // Base regs in order of first load.
  SmallVector<unsigned, 4> StBases;   // Base regs in order of first store.

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      MI2LocMap[MI] = Loc++;
      if (!isMemoryOp(MI))
        continue;
      // Only reschedule unpredicated memory ops.
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::FLDS || Opc == ARM::FLDD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          // Same base+offset seen before means a duplicate access; end the
          // region here so both regions stay duplicate-free.
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    // More instructions remain: reset per-region state before continuing.
    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}
1534
1535
1536/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
1537/// optimization pass.
1538FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
1539  if (PreAlloc)
1540    return new ARMPreAllocLoadStoreOpt();
1541  return new ARMLoadStoreOpt();
1542}
1543