ARMLoadStoreOptimizer.cpp revision 3063aed8d29cf2418fc1c3022a3dd9c8de0e4922
1//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file contains a pass that performs load / store related peephole
11// optimizations. This pass should be run after register allocation.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "arm-ldst-opt"
16#include "ARM.h"
17#include "ARMAddressingModes.h"
18#include "ARMBaseInstrInfo.h"
19#include "ARMMachineFunctionInfo.h"
20#include "ARMRegisterInfo.h"
21#include "llvm/DerivedTypes.h"
22#include "llvm/Function.h"
23#include "llvm/CodeGen/MachineBasicBlock.h"
24#include "llvm/CodeGen/MachineFunctionPass.h"
25#include "llvm/CodeGen/MachineInstr.h"
26#include "llvm/CodeGen/MachineInstrBuilder.h"
27#include "llvm/CodeGen/MachineRegisterInfo.h"
28#include "llvm/CodeGen/RegisterScavenging.h"
29#include "llvm/Target/TargetData.h"
30#include "llvm/Target/TargetInstrInfo.h"
31#include "llvm/Target/TargetMachine.h"
32#include "llvm/Target/TargetRegisterInfo.h"
33#include "llvm/Support/ErrorHandling.h"
34#include "llvm/ADT/DenseMap.h"
35#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/SmallPtrSet.h"
37#include "llvm/ADT/SmallSet.h"
38#include "llvm/ADT/SmallVector.h"
39#include "llvm/ADT/Statistic.h"
40using namespace llvm;
41
// Pass-wide statistics (reported with -stats). The counters are bumped as
// the corresponding transformations are performed below.
STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
53
/// ARMLoadStoreOpt - Post-register-allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
56
namespace {
  /// ARMLoadStoreOpt - Merges neighboring load / store instructions that use
  /// the same base register into ldm / stm (and vldm / vstm) instructions.
  /// Runs after register allocation.
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}

    // Per-function state cached for use by the helpers below.
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;          // Used to find a scratch register for new bases.
    bool isThumb2;             // Selects Thumb2 vs. ARM opcodes/sub-modes.

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    /// MemOpQueueEntry - One candidate load / store: its immediate offset,
    /// its ordinal position within the basic block, and an iterator to the
    /// instruction itself.
    struct MemOpQueueEntry {
      int Offset;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;             // True once folded into a multiple instruction.
      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
        : Offset(o), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    // Build a single ldm / stm from the registers in Regs; returns true on
    // success.
    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    // Wrapper around MergeOps that, on success, erases the merged singles
    // from the block and records the new instruction in Merges.
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator MBBI,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        MemOpQueue &MemOps,
                        unsigned memOpsFrom,
                        unsigned memOpsTo,
                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
    // Partition MemOps[SIndex..] into runs of consecutive offsets and merge
    // each run.
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);

    // Advance the register scavenger to just before the earliest queued op.
    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    // Fold a neighboring base-register inc/dec into a single load / store.
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    // Fold a neighboring base-register inc/dec into an ldm / stm writeback.
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}
127
128static int getLoadStoreMultipleOpcode(int Opcode) {
129  switch (Opcode) {
130  case ARM::LDR:
131    NumLDMGened++;
132    return ARM::LDM;
133  case ARM::STR:
134    NumSTMGened++;
135    return ARM::STM;
136  case ARM::t2LDRi8:
137  case ARM::t2LDRi12:
138    NumLDMGened++;
139    return ARM::t2LDM;
140  case ARM::t2STRi8:
141  case ARM::t2STRi12:
142    NumSTMGened++;
143    return ARM::t2STM;
144  case ARM::VLDRS:
145    NumVLDMGened++;
146    return ARM::VLDMS;
147  case ARM::VSTRS:
148    NumVSTMGened++;
149    return ARM::VSTMS;
150  case ARM::VLDRD:
151    NumVLDMGened++;
152    return ARM::VLDMD;
153  case ARM::VSTRD:
154    NumVSTMGened++;
155    return ARM::VSTMD;
156  default: llvm_unreachable("Unhandled opcode!");
157  }
158  return 0;
159}
160
161static bool isT2i32Load(unsigned Opc) {
162  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
163}
164
165static bool isi32Load(unsigned Opc) {
166  return Opc == ARM::LDR || isT2i32Load(Opc);
167}
168
169static bool isT2i32Store(unsigned Opc) {
170  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
171}
172
173static bool isi32Store(unsigned Opc) {
174  return Opc == ARM::STR || isT2i32Store(Opc);
175}
176
/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  // Pick the addressing sub-mode from the starting offset. AM4 is the
  // integer ldm/stm family; VFP loads / stores use AM5.
  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  if (isAM4 && Offset == 4) {
    if (isThumb2)
      // Thumb2 does not support ldmib / stmib.
      return false;
    Mode = ARM_AM::ib;
  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
    if (isThumb2)
      // Thumb2 does not support ldmda / stmda.
      return false;
    Mode = ARM_AM::da;
  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    // The add / sub needs an encodable immediate.
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false;  // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  bool isDPR = Opcode == ARM::VLDRD || Opcode == ARM::VSTRD;
  bool isDef = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  // AM4 (ldm/stm) takes just a mode immediate; AM5 (vldm/vstm) also encodes
  // the register count (doubled for D registers, which occupy two S slots).
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
  MIB.addReg(0); // Add optional writeback (0 for now).
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}
264
// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
// success.
void ARMLoadStoreOpt::
MergeOpsUpdate(MachineBasicBlock &MBB,
               MachineBasicBlock::iterator MBBI,
               int Offset,
               unsigned Base,
               bool BaseKill,
               int Opcode,
               ARMCC::CondCodes Pred,
               unsigned PredReg,
               unsigned Scratch,
               DebugLoc dl,
               MemOpQueue &MemOps,
               unsigned memOpsFrom,
               unsigned memOpsTo,
               SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  for (unsigned i = memOpsFrom; i < memOpsTo; ++i) {
    // Operand 0 of each queued load / store is the transferred register.
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    Regs.push_back(std::make_pair(MO.getReg(), MO.isKill()));
  }

  if (!MergeOps(MBB, MBBI, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs))
    return;

  // Merge succeeded, update records.
  // MergeOps inserted the new instruction just before MBBI.
  Merges.push_back(prior(MBBI));
  for (unsigned i = memOpsFrom; i < memOpsTo; ++i) {
    // Erase the now-redundant single op and mark its queue entry consumed.
    MBB.erase(MemOps[i].MBBI);
    MemOps[i].Merged = true;
  }
}
301
/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          unsigned Base, int Opcode, unsigned Size,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;    // Offset of the last op in the run.
  int SOffset = Offset;                  // Starting offset of the run.
  unsigned Pos = MemOps[SIndex].Position;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  unsigned PReg = Loc->getOperand(0).getReg();
  unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);

  // Grow the run while offsets stay consecutive and register numbers
  // satisfy the addressing-mode constraints.
  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      Offset += Size;
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try merge the earlier ones first.
      // ++Loc inserts the merged instruction just after the latest op taken.
      MergeOpsUpdate(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
                     Scratch, dl, MemOps, SIndex, i, Merges);
      // Recurse to handle the remaining ops starting at the mismatch.
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    // Track the op that appears latest in the block; the merged instruction
    // must be placed after it.
    if (MemOps[i].Position > Pos) {
      Pos = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  // All remaining ops form a single run; merge them.
  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  MergeOpsUpdate(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
                 Scratch, dl, MemOps, SIndex, MemOps.size(), Merges);
  return;
}
349
350static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
351                                       unsigned Bytes, unsigned Limit,
352                                       ARMCC::CondCodes Pred, unsigned PredReg){
353  unsigned MyPredReg = 0;
354  if (!MI)
355    return false;
356  if (MI->getOpcode() != ARM::t2SUBri &&
357      MI->getOpcode() != ARM::t2SUBrSPi &&
358      MI->getOpcode() != ARM::t2SUBrSPi12 &&
359      MI->getOpcode() != ARM::tSUBspi &&
360      MI->getOpcode() != ARM::SUBri)
361    return false;
362
363  // Make sure the offset fits in 8 bits.
364  if (Bytes <= 0 || (Limit && Bytes >= Limit))
365    return false;
366
367  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
368  return (MI->getOperand(0).getReg() == Base &&
369          MI->getOperand(1).getReg() == Base &&
370          (MI->getOperand(2).getImm()*Scale) == Bytes &&
371          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
372          MyPredReg == PredReg);
373}
374
375static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
376                                       unsigned Bytes, unsigned Limit,
377                                       ARMCC::CondCodes Pred, unsigned PredReg){
378  unsigned MyPredReg = 0;
379  if (!MI)
380    return false;
381  if (MI->getOpcode() != ARM::t2ADDri &&
382      MI->getOpcode() != ARM::t2ADDrSPi &&
383      MI->getOpcode() != ARM::t2ADDrSPi12 &&
384      MI->getOpcode() != ARM::tADDspi &&
385      MI->getOpcode() != ARM::ADDri)
386    return false;
387
388  if (Bytes <= 0 || (Limit && Bytes >= Limit))
389    // Make sure the offset fits in 8 bits.
390    return false;
391
392  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
393  return (MI->getOperand(0).getReg() == Base &&
394          MI->getOperand(1).getReg() == Base &&
395          (MI->getOperand(2).getImm()*Scale) == Bytes &&
396          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
397          MyPredReg == PredReg);
398}
399
/// getLSMultipleTransferSize - Return the number of bytes MI transfers, or
/// 0 if it is not a load / store this pass understands.
static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDR:
  case ARM::STR:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDM:
  case ARM::STM:
  case ARM::t2LDM:
  case ARM::t2STM:
    // Operands are: base, mode imm, pred, pred-reg, writeback, then the
    // transferred registers (see MergeOps) — so NumOperands - 5 registers
    // of 4 bytes each.
    return (MI->getNumOperands() - 5) * 4;
  case ARM::VLDMS:
  case ARM::VSTMS:
  case ARM::VLDMD:
  case ARM::VSTMD:
    // The AM5 offset field of operand 1 holds the word count.
    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
  }
}
427
/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
440bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
441                                               MachineBasicBlock::iterator MBBI,
442                                               bool &Advance,
443                                               MachineBasicBlock::iterator &I) {
444  MachineInstr *MI = MBBI;
445  unsigned Base = MI->getOperand(0).getReg();
446  unsigned Bytes = getLSMultipleTransferSize(MI);
447  unsigned PredReg = 0;
448  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
449  int Opcode = MI->getOpcode();
450  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
451    Opcode == ARM::STM || Opcode == ARM::t2STM;
452
453  if (isAM4) {
454    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
455      return false;
456
457    // Can't use the updating AM4 sub-mode if the base register is also a dest
458    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
459    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
460      if (MI->getOperand(i).getReg() == Base)
461        return false;
462    }
463
464    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
465    if (MBBI != MBB.begin()) {
466      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
467      if (Mode == ARM_AM::ia &&
468          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
469        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
470        MI->getOperand(4).setReg(Base);
471        MI->getOperand(4).setIsDef();
472        MBB.erase(PrevMBBI);
473        return true;
474      } else if (Mode == ARM_AM::ib &&
475                 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
476        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
477        MI->getOperand(4).setReg(Base);  // WB to base
478        MI->getOperand(4).setIsDef();
479        MBB.erase(PrevMBBI);
480        return true;
481      }
482    }
483
484    if (MBBI != MBB.end()) {
485      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
486      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
487          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
488        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
489        MI->getOperand(4).setReg(Base);  // WB to base
490        MI->getOperand(4).setIsDef();
491        if (NextMBBI == I) {
492          Advance = true;
493          ++I;
494        }
495        MBB.erase(NextMBBI);
496        return true;
497      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
498                 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
499        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
500        MI->getOperand(4).setReg(Base);  // WB to base
501        MI->getOperand(4).setIsDef();
502        if (NextMBBI == I) {
503          Advance = true;
504          ++I;
505        }
506        MBB.erase(NextMBBI);
507        return true;
508      }
509    }
510  } else {
511    // VLDM{D|S}, VSTM{D|S} addressing mode 5 ops.
512    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
513      return false;
514
515    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
516    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
517    if (MBBI != MBB.begin()) {
518      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
519      if (Mode == ARM_AM::ia &&
520          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
521        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
522        MI->getOperand(4).setReg(Base);  // WB to base
523        MI->getOperand(4).setIsDef();
524        MBB.erase(PrevMBBI);
525        return true;
526      }
527    }
528
529    if (MBBI != MBB.end()) {
530      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
531      if (Mode == ARM_AM::ia &&
532          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
533        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
534        MI->getOperand(4).setReg(Base);  // WB to base
535        MI->getOperand(4).setIsDef();
536        if (NextMBBI == I) {
537          Advance = true;
538          ++I;
539        }
540        MBB.erase(NextMBBI);
541      }
542      return true;
543    }
544  }
545
546  return false;
547}
548
549static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
550  switch (Opc) {
551  case ARM::LDR: return ARM::LDR_PRE;
552  case ARM::STR: return ARM::STR_PRE;
553  case ARM::VLDRS: return ARM::VLDMS;
554  case ARM::VLDRD: return ARM::VLDMD;
555  case ARM::VSTRS: return ARM::VSTMS;
556  case ARM::VSTRD: return ARM::VSTMD;
557  case ARM::t2LDRi8:
558  case ARM::t2LDRi12:
559    return ARM::t2LDR_PRE;
560  case ARM::t2STRi8:
561  case ARM::t2STRi12:
562    return ARM::t2STR_PRE;
563  default: llvm_unreachable("Unhandled opcode!");
564  }
565  return 0;
566}
567
568static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
569  switch (Opc) {
570  case ARM::LDR: return ARM::LDR_POST;
571  case ARM::STR: return ARM::STR_POST;
572  case ARM::VLDRS: return ARM::VLDMS;
573  case ARM::VLDRD: return ARM::VLDMD;
574  case ARM::VSTRS: return ARM::VSTMS;
575  case ARM::VSTRD: return ARM::VSTMD;
576  case ARM::t2LDRi8:
577  case ARM::t2LDRi12:
578    return ARM::t2LDR_POST;
579  case ARM::t2STRi8:
580  case ARM::t2STRi12:
581    return ARM::t2STR_POST;
582  default: llvm_unreachable("Unhandled opcode!");
583  }
584  return 0;
585}
586
/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
    Opcode == ARM::VSTRD || Opcode == ARM::VSTRS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  // Only zero-offset forms can absorb a base update.
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  // A matching update immediately before becomes a pre-indexed op...
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // ...and one immediately after becomes a post-indexed op.
  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      // If the erased update is the outer loop's cursor, step it forward so
      // the caller does not visit a dead instruction.
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  // Encode the offset/mode immediate for the replacement instruction.
  bool isDPR = NewOpc == ARM::VLDMD || NewOpc == ARM::VSTMD;
  unsigned Offset = 0;
  if (isAM5)
    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
                               ? ARM_AM::db
                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
  if (isLd) {
    if (isAM5)
      // VLDMS, VLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST,
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // VSTMS, VSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  // The original single load / store is no longer needed.
  MBB.erase(MBBI);

  return true;
}
713
714/// isMemoryOp - Returns true if instruction is a memory operations (that this
715/// pass is capable of operating on).
716static bool isMemoryOp(const MachineInstr *MI) {
717  int Opcode = MI->getOpcode();
718  switch (Opcode) {
719  default: break;
720  case ARM::LDR:
721  case ARM::STR:
722    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
723  case ARM::VLDRS:
724  case ARM::VSTRS:
725    return MI->getOperand(1).isReg();
726  case ARM::VLDRD:
727  case ARM::VSTRD:
728    return MI->getOperand(1).isReg();
729  case ARM::t2LDRi8:
730  case ARM::t2LDRi12:
731  case ARM::t2STRi8:
732  case ARM::t2STRi12:
733    return MI->getOperand(1).isReg();
734  }
735  return false;
736}
737
738/// AdvanceRS - Advance register scavenger to just before the earliest memory
739/// op that is being merged.
740void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
741  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
742  unsigned Position = MemOps[0].Position;
743  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
744    if (MemOps[i].Position < Position) {
745      Position = MemOps[i].Position;
746      Loc = MemOps[i].MBBI;
747    }
748  }
749
750  if (Loc != MBB.begin())
751    RS->forward(prior(Loc));
752}
753
754static int getMemoryOpOffset(const MachineInstr *MI) {
755  int Opcode = MI->getOpcode();
756  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
757  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
758  unsigned NumOperands = MI->getDesc().getNumOperands();
759  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
760
761  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
762      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
763      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
764    return OffField;
765
766  int Offset = isAM2
767    ? ARM_AM::getAM2Offset(OffField)
768    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
769             : ARM_AM::getAM5Offset(OffField) * 4);
770  if (isAM2) {
771    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
772      Offset = -Offset;
773  } else if (isAM3) {
774    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
775      Offset = -Offset;
776  } else {
777    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
778      Offset = -Offset;
779  }
780  return Offset;
781}
782
/// InsertLDR_STR - Emit a single load (isDef true) or store with the given
/// register/base/offset operands and liveness flags. For non-Thumb2 the
/// offset is re-encoded as an AM2 opcode and an offset-register operand is
/// added before the immediate.
/// NOTE(review): the dl parameter is currently unused — the builders take
/// MBBI->getDebugLoc() instead; confirm which is intended.
static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int OffImm, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          unsigned OffReg, bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  int Offset = OffImm;
  if (!isT2) {
    // Fold the sign of the offset into the AM2 add/sub direction.
    if (OffImm < 0)
      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
    else
      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
  }
  if (isDef) {
    // Load: Reg is a def (possibly dead).
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    // Store: Reg is a use (possibly killed/undef).
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}
817
/// FixInvalidRegPairOp - An LDRD / STRD whose two transfer registers do not
/// form a legal even/odd register pair is rewritten either as a single
/// LDM / STM (when the registers ascend and there is no offset) or as two
/// separate LDR / STR instructions. MBBI is repositioned to the first
/// replacement instruction so the caller rescans from there.
/// NOTE(review): always returns false in the current code; the caller's
/// `if (FixInvalidRegPairOp(...)) continue;` never triggers — confirm intent.
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg  = MI->getOperand(1).getReg();
    // Dwarf register numbers are used here to test physical even/odd
    // adjacency of the pair.
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
    // A legal pair (even register followed by the next odd one) needs no fix.
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    // For loads the interesting flag on the transfer regs is "dead"; for
    // stores it is "kill".
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill  = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    // Thumb2 LDRD/STRD have no offset register operand.
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      // For Thumb2 a negative offset requires the i8 form; the i12 form only
      // encodes non-negative immediates.
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and base register is killed, it may have been
      // re-defed by the load, make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        // Emit the high-half load first so the base/offset registers remain
        // valid for the second load; kill flags go on the last use.
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is probably
          // on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    // Step back to the last instruction inserted before MI, then delete MI;
    // the caller continues scanning from the replacement code.
    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  return false;
}
929
/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  // State for the chain currently being accumulated: same base register,
  // same opcode, same predicate. CurrBase == 0 means no open chain.
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    // First legalize any LDRD / STRD with an illegal register pair; MBBI is
    // repositioned to the replacement code when that happens.
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;
    bool TryMerge = false;
    bool Clobber  = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue. Keep MemOps sorted by offset:
          // append in the common ascending case, otherwise insert in place.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reach the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's needed.
        // First advance to the instruction just before the start of the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      // Reset chain state so a fresh chain can start at the next memory op.
      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}
1082
1083namespace {
1084  struct OffsetCompare {
1085    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
1086      int LOffset = getMemoryOpOffset(LHS);
1087      int ROffset = getMemoryOpOffset(RHS);
1088      assert(LHS == RHS || LOffset != ROffset);
1089      return LOffset > ROffset;
1090    }
1091  };
1092}
1093
1094/// MergeReturnIntoLDM - If this is a exit BB, try merging the return op
1095/// (bx lr) into the preceeding stack restore so it directly restore the value
1096/// of LR into pc.
1097///   ldmfd sp!, {r7, lr}
1098///   bx lr
1099/// =>
1100///   ldmfd sp!, {r7, pc}
1101bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
1102  if (MBB.empty()) return false;
1103
1104  MachineBasicBlock::iterator MBBI = prior(MBB.end());
1105  if (MBBI != MBB.begin() &&
1106      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
1107    MachineInstr *PrevMI = prior(MBBI);
1108    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
1109      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
1110      if (MO.getReg() != ARM::LR)
1111        return false;
1112      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
1113      PrevMI->setDesc(TII->get(NewOpc));
1114      MO.setReg(ARM::PC);
1115      MBB.erase(MBBI);
1116      return true;
1117    }
1118  }
1119  return false;
1120}
1121
1122bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1123  const TargetMachine &TM = Fn.getTarget();
1124  AFI = Fn.getInfo<ARMFunctionInfo>();
1125  TII = TM.getInstrInfo();
1126  TRI = TM.getRegisterInfo();
1127  RS = new RegScavenger();
1128  isThumb2 = AFI->isThumb2Function();
1129
1130  bool Modified = false;
1131  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1132       ++MFI) {
1133    MachineBasicBlock &MBB = *MFI;
1134    Modified |= LoadStoreMultipleOpti(MBB);
1135    Modified |= MergeReturnIntoLDM(MBB);
1136  }
1137
1138  delete RS;
1139  return Modified;
1140}
1141
1142
/// ARMPreAllocLoadStoreOpt - Pre-register-allocation pass that moves
/// loads / stores from consecutive locations close together to make it
/// more likely they will be combined later.

namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}

    // Cached per-function target and register info, set up in
    // runOnMachineFunction.
    const TargetData *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    // CanFormLdStDWord - Check whether two loads / stores can be combined
    // into a single LDRD / STRD; on success fills in the new opcode,
    // registers, encoded offset and predicate.
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    // RescheduleOps - Move a group of same-base loads or stores next to each
    // other (or form LDRD / STRD pairs) when safe and profitable.
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    // RescheduleLoadStoreInstrs - Scan a block, bucket memory ops by base
    // register, and invoke RescheduleOps on each bucket.
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}
1180
1181bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1182  TD  = Fn.getTarget().getTargetData();
1183  TII = Fn.getTarget().getInstrInfo();
1184  TRI = Fn.getTarget().getRegisterInfo();
1185  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
1186  MRI = &Fn.getRegInfo();
1187  MF  = &Fn;
1188
1189  bool Modified = false;
1190  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1191       ++MFI)
1192    Modified |= RescheduleLoadStoreInstrs(MFI);
1193
1194  return Modified;
1195}
1196
1197static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
1198                                      MachineBasicBlock::iterator I,
1199                                      MachineBasicBlock::iterator E,
1200                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
1201                                      SmallSet<unsigned, 4> &MemRegs,
1202                                      const TargetRegisterInfo *TRI) {
1203  // Are there stores / loads / calls between them?
1204  // FIXME: This is overly conservative. We should make use of alias information
1205  // some day.
1206  SmallSet<unsigned, 4> AddedRegPressure;
1207  while (++I != E) {
1208    if (MemOps.count(&*I))
1209      continue;
1210    const TargetInstrDesc &TID = I->getDesc();
1211    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
1212      return false;
1213    if (isLd && TID.mayStore())
1214      return false;
1215    if (!isLd) {
1216      if (TID.mayLoad())
1217        return false;
1218      // It's not safe to move the first 'str' down.
1219      // str r1, [r0]
1220      // strh r5, [r0]
1221      // str r4, [r0, #+4]
1222      if (TID.mayStore())
1223        return false;
1224    }
1225    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
1226      MachineOperand &MO = I->getOperand(j);
1227      if (!MO.isReg())
1228        continue;
1229      unsigned Reg = MO.getReg();
1230      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
1231        return false;
1232      if (Reg != Base && !MemRegs.count(Reg))
1233        AddedRegPressure.insert(Reg);
1234    }
1235  }
1236
1237  // Estimate register pressure increase due to the transformation.
1238  if (MemRegs.size() <= 4)
1239    // Ok if we are moving small number of instructions.
1240    return true;
1241  return AddedRegPressure.size() <= MemRegs.size() * 2;
1242}
1243
/// CanFormLdStDWord - Determine whether the adjacent loads / stores Op0 and
/// Op1 can be combined into an LDRD / STRD. On success, fills in the new
/// opcode, the two transfer registers, base / offset registers, the encoded
/// offset and the predicate, and returns true.
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the offset registers match.
  if (!isT2 &&
      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
      return false;

  // Make sure the base address satisfies i64 ld / st alignment requirement.
  // The memory operand must refer to a known IR value and be non-volatile so
  // the alignment query below is meaningful.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 need 8-byte align
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      // Negative offsets only reach -255 in the t2 i8 form.
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    // Thumb2 keeps the signed offset as an immediate operand.
    Offset = OffImm;
  } else {
    // ARM mode re-encodes the offset as an addrmode3 immediate with an
    // explicit add/sub direction.
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  // LDRD / STRD need two distinct transfer registers.
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  if (!isT2)
    OffReg = Op0->getOperand(2).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}
1329
/// RescheduleOps - Given a list of same-base, unpredicated loads or stores
/// (Ops), find maximal runs at consecutive offsets and move them next to each
/// other so the post-RA pass can merge them. Pairs of suitable ops are
/// rewritten directly into LDRD / STRD. MI2LocMap gives each instruction's
/// position within the scheduling window. Returns true if anything changed.
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    // Ops is sorted by decreasing offset, so walk it backwards to visit the
    // ops in increasing offset order; stop at the first gap, opcode change,
    // or size change.
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        // Require strictly consecutive addresses with uniform access size.
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      // Nothing mergeable at this offset; drop the highest-offset op and
      // retry with the rest.
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        // Loads cluster at the earliest op, stores at the latest; skip past
        // any group members already sitting at the insertion point.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            // Thumb2 LDRD has no offset register operand.
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            // Thumb2 STRD has no offset register operand.
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          // Just cluster the group at the insertion point, preserving their
          // relative (offset) order.
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}
1467
/// RescheduleLoadStoreInstrs - Scan the block in windows delimited by calls
/// and terminators, bucketing unpredicated loads and stores by base register.
/// Each bucket with more than one entry is handed to RescheduleOps, which
/// clusters the ops (or forms LDRD / STRD pairs). Returns true on change.
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  // MI2LocMap records each instruction's position in the current window so
  // RescheduleOps can measure how far apart group members are.
  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      MI2LocMap[MI] = Loc++;
      if (!isMemoryOp(MI))
        continue;
      // Only reschedule unpredicated memory ops.
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          // A second access to the same base+offset ends the window (see
          // StopHere handling below).
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    // Reset the buckets before starting the next window (the duplicated
    // base+offset op, if any, seeds it on the next outer iteration).
    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}
1576
1577
1578/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
1579/// optimization pass.
1580FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
1581  if (PreAlloc)
1582    return new ARMPreAllocLoadStoreOpt();
1583  return new ARMLoadStoreOpt();
1584}
1585