ARMLoadStoreOptimizer.cpp revision 27934da97bcb7a6a4949dfc0c449d99f43077e98
1//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file contains a pass that performs load / store related peephole
11// optimizations. This pass should be run after register allocation.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "arm-ldst-opt"
16#include "ARM.h"
17#include "ARMAddressingModes.h"
18#include "ARMMachineFunctionInfo.h"
19#include "ARMRegisterInfo.h"
20#include "llvm/DerivedTypes.h"
21#include "llvm/CodeGen/MachineBasicBlock.h"
22#include "llvm/CodeGen/MachineFunctionPass.h"
23#include "llvm/CodeGen/MachineInstr.h"
24#include "llvm/CodeGen/MachineInstrBuilder.h"
25#include "llvm/CodeGen/MachineRegisterInfo.h"
26#include "llvm/CodeGen/RegisterScavenging.h"
27#include "llvm/Target/TargetData.h"
28#include "llvm/Target/TargetInstrInfo.h"
29#include "llvm/Target/TargetMachine.h"
30#include "llvm/Target/TargetRegisterInfo.h"
31#include "llvm/Support/Compiler.h"
32#include "llvm/Support/ErrorHandling.h"
33#include "llvm/ADT/DenseMap.h"
34#include "llvm/ADT/STLExtras.h"
35#include "llvm/ADT/SmallPtrSet.h"
36#include "llvm/ADT/SmallSet.h"
37#include "llvm/ADT/SmallVector.h"
38#include "llvm/ADT/Statistic.h"
39using namespace llvm;
40
// Pass-wide counters reported via -stats: how many multiple-op instructions
// were formed, how many ops were moved, and how many ldrd / strd fixups
// (see FixInvalidRegPairOp) were applied.
STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumFLDMGened, "Number of fldm instructions generated");
STATISTIC(NumFSTMGened, "Number of fstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
52
/// ARMLoadStoreOpt - Post-register-allocation pass that combines adjacent
/// load / store instructions to form ldm / stm instructions.

namespace {
  struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetInstrInfo *TII;     // Set up per-function by runOnMachineFunction.
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;               // Finds scratch registers for new base values.
    bool isThumb2;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    /// MemOpQueueEntry - One candidate load / store: its immediate offset
    /// from the shared base register, its ordinal position in the basic
    /// block, and an iterator to the instruction itself.
    struct MemOpQueueEntry {
      int Offset;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;    // True once this op has been folded into a multiple.
      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
        : Offset(o), Position(p), MBBI(i), Merged(false) {};
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    // MergeOps - Emit one ldm / stm covering Regs; see the definition below.
    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    // MergeLDR_STR - Merge a queued run of loads / stores into multiples.
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);

    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}
112
/// getLoadStoreMultipleOpcode - Map a single load / store opcode to the
/// corresponding load / store multiple opcode. Note the side effect: it also
/// bumps the matching "instructions generated" statistic, so it must only be
/// called when the multiple instruction is actually being emitted.
static int getLoadStoreMultipleOpcode(int Opcode) {
  switch (Opcode) {
  case ARM::LDR:
    NumLDMGened++;
    return ARM::LDM;
  case ARM::STR:
    NumSTMGened++;
    return ARM::STM;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    NumLDMGened++;
    return ARM::t2LDM;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    NumSTMGened++;
    return ARM::t2STM;
  case ARM::FLDS:
    NumFLDMGened++;
    return ARM::FLDMS;
  case ARM::FSTS:
    NumFSTMGened++;
    return ARM::FSTMS;
  case ARM::FLDD:
    NumFLDMGened++;
    return ARM::FLDMD;
  case ARM::FSTD:
    NumFSTMGened++;
    return ARM::FSTMD;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;  // Not reached; pacifies compilers that expect a return here.
}
145
146static bool isT2i32Load(unsigned Opc) {
147  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
148}
149
150static bool isi32Load(unsigned Opc) {
151  return Opc == ARM::LDR || isT2i32Load(Opc);
152}
153
154static bool isT2i32Store(unsigned Opc) {
155  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
156}
157
158static bool isi32Store(unsigned Opc) {
159  return Opc == ARM::STR || isT2i32Store(Opc);
160}
161
/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  // Pick the addressing sub-mode implied by the starting offset. For the
  // integer (AM4) forms: 0 -> ia, 4 -> ib, -4*N+4 -> da, -4*N -> db.
  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  if (isAM4 && Offset == 4)
    Mode = ARM_AM::ib;
  else if (isAM4 && Offset == -4 * (int)NumRegs + 4)
    Mode = ARM_AM::da;
  else if (isAM4 && Offset == -4 * (int)NumRegs)
    Mode = ARM_AM::db;
  else if (Offset != 0) {
    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination register to
      // use as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Use the scratch register to use as a new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
    if (Offset < 0) {
      // Negative offset: subtract the magnitude instead.
      BaseOpc = isThumb2 ? ARM::t2SUBri : ARM::SUBri;
      Offset = - Offset;
    }
    // The adjustment must be encodable as an ARM / Thumb2 shifter immediate.
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false;  // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  // AM5 (FLDM / FSTM) encodes the word count in the mode immediate; a
  // double-precision register counts as two words.
  bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
  bool isDef = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
  // Append the register operands: defs for loads, uses (with the recorded
  // kill state) for stores.
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}
238
/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions. MemOps[SIndex..] holds the candidate
/// ops (same base, predicate, and opcode class); iterators for the merged
/// instructions are appended to Merges, and merged originals are erased.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          unsigned Base, int Opcode, unsigned Size,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;                // Starting offset of the current run.
  unsigned Pos = MemOps[SIndex].Position;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  unsigned PReg = Loc->getOperand(0).getReg();
  unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);
  bool isKill = Loc->getOperand(0).isKill();

  // Grow the run while offsets stay contiguous and register numbers satisfy
  // the addressing-mode ordering constraints.
  SmallVector<std::pair<unsigned,bool>, 8> Regs;
  Regs.push_back(std::make_pair(PReg, isKill));
  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
    isKill = MemOps[i].MBBI->getOperand(0).isKill();
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      Offset += Size;
      Regs.push_back(std::make_pair(Reg, isKill));
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try merge the earlier ones first.
      // Insert just past Loc, the latest instruction in the run so far.
      if (MergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
                   Scratch, dl, Regs)) {
        Merges.push_back(prior(Loc));
        for (unsigned j = SIndex; j < i; ++j) {
          MBB.erase(MemOps[j].MBBI);
          MemOps[j].Merged = true;
        }
      }
      // Recurse on the remainder of the queue starting at the op that
      // broke the run.
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    // Track the latest position so the merged op is inserted after every
    // original it replaces.
    if (MemOps[i].Position > Pos) {
      Pos = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  // The whole tail of the queue formed one run; merge it in place.
  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  if (MergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
               Scratch, dl, Regs)) {
    Merges.push_back(prior(Loc));
    for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) {
      MBB.erase(MemOps[i].MBBI);
      MemOps[i].Merged = true;
    }
  }

  return;
}
304
305/// getInstrPredicate - If instruction is predicated, returns its predicate
306/// condition, otherwise returns AL. It also returns the condition code
307/// register by reference.
308static ARMCC::CondCodes getInstrPredicate(MachineInstr *MI, unsigned &PredReg) {
309  int PIdx = MI->findFirstPredOperandIdx();
310  if (PIdx == -1) {
311    PredReg = 0;
312    return ARMCC::AL;
313  }
314
315  PredReg = MI->getOperand(PIdx+1).getReg();
316  return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
317}
318
319static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
320                                       unsigned Bytes, unsigned Limit,
321                                       ARMCC::CondCodes Pred, unsigned PredReg){
322  unsigned MyPredReg = 0;
323  if (!MI)
324    return false;
325  if (MI->getOpcode() != ARM::t2SUBri &&
326      MI->getOpcode() != ARM::SUBri)
327    return false;
328
329  // Make sure the offset fits in 8 bits.
330  if (Bytes <= 0 || (Limit && Bytes >= Limit))
331    return false;
332
333  return (MI->getOperand(0).getReg() == Base &&
334          MI->getOperand(1).getReg() == Base &&
335          MI->getOperand(2).getImm() == Bytes &&
336          getInstrPredicate(MI, MyPredReg) == Pred &&
337          MyPredReg == PredReg);
338}
339
340static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
341                                       unsigned Bytes, unsigned Limit,
342                                       ARMCC::CondCodes Pred, unsigned PredReg){
343  unsigned MyPredReg = 0;
344  if (!MI)
345    return false;
346  if (MI->getOpcode() != ARM::t2ADDri &&
347      MI->getOpcode() != ARM::ADDri)
348    return false;
349
350  if (Bytes <= 0 || (Limit && Bytes >= Limit))
351    // Make sure the offset fits in 8 bits.
352    return false;
353
354  return (MI->getOperand(0).getReg() == Base &&
355          MI->getOperand(1).getReg() == Base &&
356          MI->getOperand(2).getImm() == Bytes &&
357          getInstrPredicate(MI, MyPredReg) == Pred &&
358          MyPredReg == PredReg);
359}
360
/// getLSMultipleTransferSize - Return the number of bytes the given load /
/// store (or load / store multiple) instruction transfers; 0 for opcodes
/// this pass does not handle.
static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDR:
  case ARM::STR:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::FLDS:
  case ARM::FSTS:
    return 4;
  case ARM::FLDD:
  case ARM::FSTD:
    return 8;
  case ARM::LDM:
  case ARM::STM:
  case ARM::t2LDM:
  case ARM::t2STM:
    // Operands are: base, mode imm, pred, pred-reg, then one per register,
    // so the register count is NumOperands - 4.
    return (MI->getNumOperands() - 4) * 4;
  case ARM::FLDMS:
  case ARM::FSTMS:
  case ARM::FLDMD:
  case ARM::FSTMD:
    // The AM5 mode immediate encodes the transfer size in words.
    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
  }
}
388
389/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
390/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
391///
392/// stmia rn, <ra, rb, rc>
393/// rn := rn + 4 * 3;
394/// =>
395/// stmia rn!, <ra, rb, rc>
396///
397/// rn := rn - 4 * 3;
398/// ldmia rn, <ra, rb, rc>
399/// =>
400/// ldmdb rn!, <ra, rb, rc>
401bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
402                                               MachineBasicBlock::iterator MBBI,
403                                               bool &Advance,
404                                               MachineBasicBlock::iterator &I) {
405  MachineInstr *MI = MBBI;
406  unsigned Base = MI->getOperand(0).getReg();
407  unsigned Bytes = getLSMultipleTransferSize(MI);
408  unsigned PredReg = 0;
409  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
410  int Opcode = MI->getOpcode();
411  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
412    Opcode == ARM::STM || Opcode == ARM::t2STM;
413
414  if (isAM4) {
415    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
416      return false;
417
418    // Can't use the updating AM4 sub-mode if the base register is also a dest
419    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
420    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
421      if (MI->getOperand(i).getReg() == Base)
422        return false;
423    }
424
425    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
426    if (MBBI != MBB.begin()) {
427      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
428      if (Mode == ARM_AM::ia &&
429          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
430        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
431        MBB.erase(PrevMBBI);
432        return true;
433      } else if (Mode == ARM_AM::ib &&
434                 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
435        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
436        MBB.erase(PrevMBBI);
437        return true;
438      }
439    }
440
441    if (MBBI != MBB.end()) {
442      MachineBasicBlock::iterator NextMBBI = next(MBBI);
443      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
444          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
445        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
446        if (NextMBBI == I) {
447          Advance = true;
448          ++I;
449        }
450        MBB.erase(NextMBBI);
451        return true;
452      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
453                 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
454        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
455        if (NextMBBI == I) {
456          Advance = true;
457          ++I;
458        }
459        MBB.erase(NextMBBI);
460        return true;
461      }
462    }
463  } else {
464    // FLDM{D|S}, FSTM{D|S} addressing mode 5 ops.
465    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
466      return false;
467
468    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
469    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
470    if (MBBI != MBB.begin()) {
471      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
472      if (Mode == ARM_AM::ia &&
473          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
474        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
475        MBB.erase(PrevMBBI);
476        return true;
477      }
478    }
479
480    if (MBBI != MBB.end()) {
481      MachineBasicBlock::iterator NextMBBI = next(MBBI);
482      if (Mode == ARM_AM::ia &&
483          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
484        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
485        if (NextMBBI == I) {
486          Advance = true;
487          ++I;
488        }
489        MBB.erase(NextMBBI);
490      }
491      return true;
492    }
493  }
494
495  return false;
496}
497
/// getPreIndexedLoadStoreOpcode - Map a load / store opcode to its
/// pre-indexed (base-writeback) form. The FP loads / stores have no
/// dedicated pre-indexed opcodes, so they map to the multiple-form opcodes,
/// which the caller emits with an updating addressing mode.
static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_PRE;
  case ARM::STR: return ARM::STR_PRE;
  case ARM::FLDS: return ARM::FLDMS;
  case ARM::FLDD: return ARM::FLDMD;
  case ARM::FSTS: return ARM::FSTMS;
  case ARM::FSTD: return ARM::FSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;  // Not reached.
}
516
/// getPostIndexedLoadStoreOpcode - Map a load / store opcode to its
/// post-indexed (base-writeback) form. As with the pre-indexed mapping, the
/// FP loads / stores map to the multiple-form opcodes, which the caller
/// emits with an updating addressing mode.
static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_POST;
  case ARM::STR: return ARM::STR_POST;
  case ARM::FLDS: return ARM::FLDMS;
  case ARM::FLDD: return ARM::FLDMD;
  case ARM::FSTS: return ARM::FSTMS;
  case ARM::FSTD: return ARM::FSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;  // Not reached.
}
535
/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
/// a preceding add/sub becomes pre-indexing, a following one post-indexing.
/// Returns true if the instruction was rewritten (the original op and the
/// matched add/sub are both erased). If the erased add/sub is the outer
/// loop's iterator I, Advance is set and I is bumped past it.
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::FLDD || Opcode == ARM::FLDS ||
    Opcode == ARM::FSTD || Opcode == ARM::FSTS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  // Only ops with a zero immediate offset can absorb an inc / dec as
  // pre / post indexing.
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  // Try the instruction just before: a matching add / sub becomes
  // pre-indexing. AM5 only supports the decrement-before form here.
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Otherwise try the instruction just after: a matching add / sub becomes
  // post-indexing.
  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      // Keep the caller's iterator valid if it pointed at the erased op.
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  // Build the replacement. AM5 encodes the update in the mode immediate
  // (a DPR counts as two words); AM2 packs add/sub + magnitude; Thumb2
  // takes the raw byte count.
  bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
  unsigned Offset = isAM5
    ? ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) ? ARM_AM::db : ARM_AM::ia,
                        true, isDPR ? 2 : 1)
    : (isAM2
       ? ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift)
       : Bytes);
  if (isLd) {
    if (isAM5)
      // FLDMS, FLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST,
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // FSTMS, FSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  MBB.erase(MBBI);

  return true;
}
657
658/// isMemoryOp - Returns true if instruction is a memory operations (that this
659/// pass is capable of operating on).
660static bool isMemoryOp(const MachineInstr *MI) {
661  int Opcode = MI->getOpcode();
662  switch (Opcode) {
663  default: break;
664  case ARM::LDR:
665  case ARM::STR:
666    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
667  case ARM::FLDS:
668  case ARM::FSTS:
669    return MI->getOperand(1).isReg();
670  case ARM::FLDD:
671  case ARM::FSTD:
672    return MI->getOperand(1).isReg();
673  case ARM::t2LDRi8:
674  case ARM::t2LDRi12:
675  case ARM::t2STRi8:
676  case ARM::t2STRi12:
677    return true;
678  }
679  return false;
680}
681
682/// AdvanceRS - Advance register scavenger to just before the earliest memory
683/// op that is being merged.
684void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
685  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
686  unsigned Position = MemOps[0].Position;
687  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
688    if (MemOps[i].Position < Position) {
689      Position = MemOps[i].Position;
690      Loc = MemOps[i].MBBI;
691    }
692  }
693
694  if (Loc != MBB.begin())
695    RS->forward(prior(Loc));
696}
697
698static int getMemoryOpOffset(const MachineInstr *MI) {
699  int Opcode = MI->getOpcode();
700  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
701  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
702  unsigned NumOperands = MI->getDesc().getNumOperands();
703  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
704
705  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
706      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
707      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
708    return OffField;
709
710  int Offset = isAM2
711    ? ARM_AM::getAM2Offset(OffField)
712    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
713             : ARM_AM::getAM5Offset(OffField) * 4);
714  if (isAM2) {
715    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
716      Offset = -Offset;
717  } else if (isAM3) {
718    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
719      Offset = -Offset;
720  } else {
721    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
722      Offset = -Offset;
723  }
724  return Offset;
725}
726
727static void InsertLDR_STR(MachineBasicBlock &MBB,
728                          MachineBasicBlock::iterator &MBBI,
729                          int OffImm, bool isDef,
730                          DebugLoc dl, unsigned NewOpc,
731                          unsigned Reg, bool RegDeadKill,
732                          unsigned BaseReg, bool BaseKill,
733                          unsigned OffReg, bool OffKill,
734                          ARMCC::CondCodes Pred, unsigned PredReg,
735                          const TargetInstrInfo *TII) {
736  unsigned Offset;
737  if (OffImm < 0)
738    Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
739  else
740    Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
741  if (isDef)
742    BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
743      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
744      .addReg(BaseReg, getKillRegState(BaseKill))
745      .addReg(OffReg,  getKillRegState(OffKill))
746      .addImm(Offset)
747      .addImm(Pred).addReg(PredReg);
748  else
749    BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
750      .addReg(Reg, getKillRegState(RegDeadKill))
751      .addReg(BaseReg, getKillRegState(BaseKill))
752      .addReg(OffReg,  getKillRegState(OffKill))
753      .addImm(Offset)
754      .addImm(Pred).addReg(PredReg);
755}
756
/// FixInvalidRegPairOp - Rewrite an ldrd / strd whose register pair does not
/// satisfy the even/odd-adjacent constraint: either into an ldm / stm (when
/// the registers ascend and there is no offset) or into two single ldr / str
/// ops. MBBI is repositioned so the caller's loop revisits the replacement
/// instructions. Currently always returns false.
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg  = MI->getOperand(1).getReg();
    // NOTE(review): uses Dwarf register numbers to test even/odd pairing —
    // presumably these mirror the physical GPR numbering; confirm.
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
    // A valid even/odd adjacent pair needs no fixing.
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    bool isLd = Opcode == ARM::LDRD;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool OddDeadKill  = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    const MachineOperand &OffOp = MI->getOperand(3);
    unsigned OffReg = OffOp.getReg();
    bool OffKill = OffOp.isKill();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDM : ARM::STM;
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg, getKillRegState(EvenDeadKill))
          .addReg(OddReg, getKillRegState(OddDeadKill));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDR : ARM::STR;
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and base register is killed, it may have been
      // re-defed by the load, make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        // Load the odd (higher-address) half first so EvenReg's redefinition
        // cannot clobber the base / offset still needed by the second load.
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill,
                      BaseReg, false, OffReg, false, Pred, PredReg, TII);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill,
                      BaseReg, BaseKill, OffReg, OffKill, Pred, PredReg, TII);
      } else {
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, BaseReg, false, OffReg, false,
                      Pred, PredReg, TII);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, BaseReg, BaseKill, OffReg, OffKill,
                      Pred, PredReg, TII);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    // Back MBBI up onto the last inserted replacement, then delete the
    // original pair op.
    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  return false;
}
840
/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
/// Scans the block building up chains of mergeable memory ops, then merges
/// each chain and tries to fold base-register updates into the results.
/// Returns true if anything was merged.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;          // Number of ops queued in MemOps.
  MemOpQueue MemOps;               // Current chain, kept sorted by offset.
  unsigned CurrBase = 0;           // Base register of the chain (0 = no chain).
  int CurrOpc = -1;                // Opcode shared by all ops in the chain.
  unsigned CurrSize = 0;           // Per-op transfer size of the chain.
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;           // Monotonic position index within the block.
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    // Rewrite illegal LDRD / STRD register pairs first. This may replace the
    // instruction MBBI points at, so re-examine the current position.
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;   // Advance MBBI at the end of this iteration?
    bool TryMerge = false;   // Attempt to merge the queued chain now?
    bool Clobber  = false;   // Load that overwrites its own base register?

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            // Common case: offsets arrive in increasing order.
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            // Out-of-order offset: insert at the right place to keep the
            // queue sorted, unless an op with the same offset already exists.
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's needed.
        // First advance to the instruction just before the start of the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register. Make sure it's a call clobbered register or
        // a spilled callee-saved register.
        unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass, true);
        if (!Scratch)
          Scratch = RS->FindUnusedReg(&ARM::GPRRegClass,
                                      AFI->getSpilledCSRegisters());
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      // Reset chain state so the next memory op starts a fresh chain.
      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}
994
995namespace {
996  struct OffsetCompare {
997    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
998      int LOffset = getMemoryOpOffset(LHS);
999      int ROffset = getMemoryOpOffset(RHS);
1000      assert(LHS == RHS || LOffset != ROffset);
1001      return LOffset > ROffset;
1002    }
1003  };
1004}
1005
1006/// MergeReturnIntoLDM - If this is a exit BB, try merging the return op
1007/// (bx lr) into the preceeding stack restore so it directly restore the value
1008/// of LR into pc.
1009///   ldmfd sp!, {r7, lr}
1010///   bx lr
1011/// =>
1012///   ldmfd sp!, {r7, pc}
1013bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
1014  if (MBB.empty()) return false;
1015
1016  MachineBasicBlock::iterator MBBI = prior(MBB.end());
1017  if (MBBI != MBB.begin() &&
1018      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
1019    MachineInstr *PrevMI = prior(MBBI);
1020    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
1021      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
1022      if (MO.getReg() != ARM::LR)
1023        return false;
1024      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
1025      PrevMI->setDesc(TII->get(NewOpc));
1026      MO.setReg(ARM::PC);
1027      MBB.erase(MBBI);
1028      return true;
1029    }
1030  }
1031  return false;
1032}
1033
1034bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1035  const TargetMachine &TM = Fn.getTarget();
1036  AFI = Fn.getInfo<ARMFunctionInfo>();
1037  TII = TM.getInstrInfo();
1038  TRI = TM.getRegisterInfo();
1039  RS = new RegScavenger();
1040  isThumb2 = AFI->isThumb2Function();
1041
1042  bool Modified = false;
1043  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1044       ++MFI) {
1045    MachineBasicBlock &MBB = *MFI;
1046    Modified |= LoadStoreMultipleOpti(MBB);
1047    Modified |= MergeReturnIntoLDM(MBB);
1048  }
1049
1050  delete RS;
1051  return Modified;
1052}
1053
1054
/// ARMPreAllocLoadStoreOpt - Pre-register-allocation pass that moves
/// loads / stores from consecutive locations close together to make it more
/// likely they will be combined later.
1058
namespace {
  /// ARMPreAllocLoadStoreOpt - Machine function pass, run before register
  /// allocation, that reschedules loads / stores off the same base register
  /// so consecutive accesses end up adjacent, and opportunistically forms
  /// LDRD / STRD pairs with register-allocation hints.
  struct VISIBILITY_HIDDEN ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetData *TD;           // Data layout, for alignment queries.
    const TargetInstrInfo *TII;     // Used to build new instructions.
    const TargetRegisterInfo *TRI;  // Used for register-overlap queries.
    const ARMSubtarget *STI;        // Subtarget features (e.g. hasV6Ops).
    MachineRegisterInfo *MRI;       // Used to set register-pair alloc hints.

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    // CanFormLdStDWord - Check whether two LDR / STR ops can be combined
    // into one LDRD / STRD; on success fills in the combined op's fields.
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, unsigned &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred);
    // RescheduleOps - Move the loads / stores in Ops (same base register)
    // next to each other when safe and profitable.
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    // RescheduleLoadStoreInstrs - Driver: bucket memory ops by base register
    // within call/terminator-free regions and reschedule each bucket.
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}
1090
1091bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1092  TD  = Fn.getTarget().getTargetData();
1093  TII = Fn.getTarget().getInstrInfo();
1094  TRI = Fn.getTarget().getRegisterInfo();
1095  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
1096  MRI = &Fn.getRegInfo();
1097
1098  bool Modified = false;
1099  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1100       ++MFI)
1101    Modified |= RescheduleLoadStoreInstrs(MFI);
1102
1103  return Modified;
1104}
1105
1106static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
1107                                      MachineBasicBlock::iterator I,
1108                                      MachineBasicBlock::iterator E,
1109                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
1110                                      SmallSet<unsigned, 4> &MemRegs,
1111                                      const TargetRegisterInfo *TRI) {
1112  // Are there stores / loads / calls between them?
1113  // FIXME: This is overly conservative. We should make use of alias information
1114  // some day.
1115  SmallSet<unsigned, 4> AddedRegPressure;
1116  while (++I != E) {
1117    if (MemOps.count(&*I))
1118      continue;
1119    const TargetInstrDesc &TID = I->getDesc();
1120    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
1121      return false;
1122    if (isLd && TID.mayStore())
1123      return false;
1124    if (!isLd) {
1125      if (TID.mayLoad())
1126        return false;
1127      // It's not safe to move the first 'str' down.
1128      // str r1, [r0]
1129      // strh r5, [r0]
1130      // str r4, [r0, #+4]
1131      if (TID.mayStore())
1132        return false;
1133    }
1134    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
1135      MachineOperand &MO = I->getOperand(j);
1136      if (!MO.isReg())
1137        continue;
1138      unsigned Reg = MO.getReg();
1139      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
1140        return false;
1141      if (Reg != Base && !MemRegs.count(Reg))
1142        AddedRegPressure.insert(Reg);
1143    }
1144  }
1145
1146  // Estimate register pressure increase due to the transformation.
1147  if (MemRegs.size() <= 4)
1148    // Ok if we are moving small number of instructions.
1149    return true;
1150  return AddedRegPressure.size() <= MemRegs.size() * 2;
1151}
1152
1153bool
1154ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
1155                                          DebugLoc &dl,
1156                                          unsigned &NewOpc, unsigned &EvenReg,
1157                                          unsigned &OddReg, unsigned &BaseReg,
1158                                          unsigned &OffReg, unsigned &Offset,
1159                                          unsigned &PredReg,
1160                                          ARMCC::CondCodes &Pred) {
1161  // FIXME: FLDS / FSTS -> FLDD / FSTD
1162  unsigned Opcode = Op0->getOpcode();
1163  if (Opcode == ARM::LDR)
1164    NewOpc = ARM::LDRD;
1165  else if (Opcode == ARM::STR)
1166    NewOpc = ARM::STRD;
1167  else
1168    return 0;
1169
1170  // Must sure the base address satisfies i64 ld / st alignment requirement.
1171  if (!Op0->hasOneMemOperand() ||
1172      !Op0->memoperands_begin()->getValue() ||
1173      Op0->memoperands_begin()->isVolatile())
1174    return false;
1175
1176  unsigned Align = Op0->memoperands_begin()->getAlignment();
1177  unsigned ReqAlign = STI->hasV6Ops()
1178    ? TD->getPrefTypeAlignment(Type::Int64Ty) : 8; // Pre-v6 need 8-byte align
1179  if (Align < ReqAlign)
1180    return false;
1181
1182  // Then make sure the immediate offset fits.
1183  int OffImm = getMemoryOpOffset(Op0);
1184  ARM_AM::AddrOpc AddSub = ARM_AM::add;
1185  if (OffImm < 0) {
1186    AddSub = ARM_AM::sub;
1187    OffImm = - OffImm;
1188  }
1189  if (OffImm >= 256) // 8 bits
1190    return false;
1191  Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
1192
1193  EvenReg = Op0->getOperand(0).getReg();
1194  OddReg  = Op1->getOperand(0).getReg();
1195  if (EvenReg == OddReg)
1196    return false;
1197  BaseReg = Op0->getOperand(1).getReg();
1198  OffReg = Op0->getOperand(2).getReg();
1199  Pred = getInstrPredicate(Op0, PredReg);
1200  dl = Op0->getDebugLoc();
1201  return true;
1202}
1203
/// RescheduleOps - Given the loads / stores in Ops, all off the same base
/// register, move runs of consecutive-offset ops next to each other so the
/// post-allocation pass can merge them, forming LDRD / STRD directly for
/// pairs. Returns true if any instruction was moved or rewritten.
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;       // Earliest block position among the run.
    unsigned LastLoc = 0;          // Latest block position among the run.
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    // Walk Ops from the back (lowest offset first, since the sort above is
    // descending), extending the run while opcode, transfer size and
    // offsets stay consecutive.
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      // Require offsets to increase by exactly the transfer size (no gaps).
      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune
        break;
    }

    if (NumMove <= 1)
      // No run found starting at the lowest offset; discard it and retry.
      Ops.pop_back();
    else {
      // Collect the ops of the run and the registers they transfer.
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        // Give up on this run; drop its ops from further consideration.
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        unsigned NewOpc = 0;
        unsigned Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg).addReg(0).addImm(Offset)
              .addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg).addReg(0).addImm(Offset)
              .addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          // Simply splice the run's instructions to the insertion point,
          // lowest offset first.
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}
1334
/// RescheduleLoadStoreInstrs - Scan the basic block in regions delimited by
/// calls / terminators, bucketing unpredicated loads and stores by base
/// register, then invoke RescheduleOps on each bucket with more than one op.
/// Returns true if any instructions were rescheduled.
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;  // Instr -> position in block.
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;  // Base regs with at least one load.
  SmallVector<unsigned, 4> StBases;  // Base regs with at least one store.

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    // Gather one region's worth of memory ops.
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      MI2LocMap[MI] = Loc++;
      if (!isMemoryOp(MI))
        continue;
      // Only reschedule unpredicated memory ops.
      unsigned PredReg = 0;
      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opcode = MI->getOpcode();
      bool isLd = Opcode == ARM::LDR ||
        Opcode == ARM::FLDS || Opcode == ARM::FLDD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        // Record the load under its base register, unless an earlier op with
        // the same base + offset exists (a duplicate ends the region).
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        // Same bookkeeping for stores.
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    // Reset per-region state before scanning the next region.
    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}
1444
1445
1446/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
1447/// optimization pass.
1448FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
1449  if (PreAlloc)
1450    return new ARMPreAllocLoadStoreOpt();
1451  return new ARMLoadStoreOpt();
1452}
1453