ARMLoadStoreOptimizer.cpp revision 3063aed8d29cf2418fc1c3022a3dd9c8de0e4922
//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");

/// ARMLoadStoreOpt - Post-register-allocation pass that combines adjacent
/// load / store instructions to form ldm / stm instructions.

namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}

    // Cached per-function state, set up by runOnMachineFunction.
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;       // Used to find a scratch reg for a new base.
    bool isThumb2;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    /// MemOpQueueEntry - One candidate load / store in the current chain.
    struct MemOpQueueEntry {
      int Offset;                          // Immediate offset from the base.
      unsigned Position;                   // Ordinal position in the block.
      MachineBasicBlock::iterator MBBI;    // The instruction itself.
      bool Merged;                         // True once folded into an LDM/STM.
      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
        : Offset(o), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator MBBI,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        MemOpQueue &MemOps,
                        unsigned memOpsFrom,
                        unsigned memOpsTo,
                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);

    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

/// getLoadStoreMultipleOpcode - Map a single load / store opcode to the
/// corresponding load / store multiple opcode, bumping the matching
/// statistic as a side effect.  Asserts on opcodes this pass never merges.
static int getLoadStoreMultipleOpcode(int Opcode) {
  switch (Opcode) {
  case ARM::LDR:
    NumLDMGened++;
    return ARM::LDM;
  case ARM::STR:
    NumSTMGened++;
    return ARM::STM;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    NumLDMGened++;
    return ARM::t2LDM;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    NumSTMGened++;
    return ARM::t2STM;
  case ARM::VLDRS:
    NumVLDMGened++;
    return ARM::VLDMS;
  case ARM::VSTRS:
    NumVSTMGened++;
    return ARM::VSTMS;
  case ARM::VLDRD:
    NumVLDMGened++;
    return ARM::VLDMD;
  case ARM::VSTRD:
    NumVSTMGened++;
    return ARM::VSTMD;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

/// isT2i32Load - True for the Thumb2 32-bit integer load opcodes.
static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

/// isi32Load - True for any (ARM or Thumb2) 32-bit integer load opcode.
static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDR || isT2i32Load(Opc);
}

/// isT2i32Store - True for the Thumb2 32-bit integer store opcodes.
static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

/// isi32Store - True for any (ARM or Thumb2) 32-bit integer store opcode.
static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STR || isT2i32Store(Opc);
}
/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  // Pick the addressing sub-mode (ia/ib/da/db) from the starting offset.
  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  if (isAM4 && Offset == 4) {
    if (isThumb2)
      // Thumb2 does not support ldmib / stmib.
      return false;
    Mode = ARM_AM::ib;
  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
    if (isThumb2)
      // Thumb2 does not support ldmda / stmda.
      return false;
    Mode = ARM_AM::da;
  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base (it is about to be overwritten anyway).
      NewBase = Regs[NumRegs-1].first;
    else {
      // For a store, use the scavenged scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    // The offset must be encodable as an ARM / Thumb2 shifter-operand imm.
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false; // Probably not worth it then.

    // NewBase = Base +/- Offset (predicated, no flags).
    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // The new base is always killed right after its use.
  }

  bool isDPR = Opcode == ARM::VLDRD || Opcode == ARM::VSTRD;
  bool isDef = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  // AM4 (integer ldm/stm) and AM5 (vldm/vstm) encode the mode immediate
  // differently; AM5 also encodes the register count (doubled for D regs).
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
  MIB.addReg(0); // Add optional writeback (0 for now).
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}
// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
// success.
void ARMLoadStoreOpt::
MergeOpsUpdate(MachineBasicBlock &MBB,
               MachineBasicBlock::iterator MBBI,
               int Offset,
               unsigned Base,
               bool BaseKill,
               int Opcode,
               ARMCC::CondCodes Pred,
               unsigned PredReg,
               unsigned Scratch,
               DebugLoc dl,
               MemOpQueue &MemOps,
               unsigned memOpsFrom,
               unsigned memOpsTo,
               SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  for (unsigned i = memOpsFrom; i < memOpsTo; ++i) {
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    Regs.push_back(std::make_pair(MO.getReg(), MO.isKill()));
  }

  if (!MergeOps(MBB, MBBI, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs))
    return;

  // Merge succeeded, update records: remember the new LDM/STM (inserted
  // just before MBBI) and erase the individual loads / stores.
  Merges.push_back(prior(MBBI));
  for (unsigned i = memOpsFrom; i < memOpsTo; ++i) {
    MBB.erase(MemOps[i].MBBI);
    MemOps[i].Merged = true;
  }
}

/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.  MemOps is sorted by offset; starting
/// at SIndex, the longest run with consecutive offsets and legal register
/// ordering is merged, then the routine recurses on the remainder.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                              unsigned Base, int Opcode, unsigned Size,
                              ARMCC::CondCodes Pred, unsigned PredReg,
                              unsigned Scratch, MemOpQueue &MemOps,
                              SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;                 // Offset of the first op in the run.
  unsigned Pos = MemOps[SIndex].Position;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  unsigned PReg = Loc->getOperand(0).getReg();
  unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      Offset += Size;
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try merge the earlier ones first.
      // Note: Loc is the latest op of the run; the LDM/STM is inserted
      // after it (++Loc) so all merged values are available.
      MergeOpsUpdate(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
                     Scratch, dl, MemOps, SIndex, i, Merges);
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    // Track the op that appears last in program order; that is where the
    // merged instruction must be inserted.
    if (MemOps[i].Position > Pos) {
      Pos = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  MergeOpsUpdate(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
                 Scratch, dl, MemOps, SIndex, MemOps.size(), Merges);
  return;
}

/// isMatchingDecrement - Return true if MI is "Base = Base - Bytes" (an ARM or
/// Thumb SUB of an immediate) under the same predicate.  Limit, when non-zero,
/// caps the encodable offset.
static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2SUBri &&
      MI->getOpcode() != ARM::t2SUBrSPi &&
      MI->getOpcode() != ARM::t2SUBrSPi12 &&
      MI->getOpcode() != ARM::tSUBspi &&
      MI->getOpcode() != ARM::SUBri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes <= 0 || (Limit && Bytes >= Limit))
    return false;

  // Thumb1 SP-relative immediates are in words, not bytes.
  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

/// isMatchingIncrement - Return true if MI is "Base = Base + Bytes" (an ARM or
/// Thumb ADD of an immediate) under the same predicate.  Limit, when non-zero,
/// caps the encodable offset.
static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2ADDri &&
      MI->getOpcode() != ARM::t2ADDrSPi &&
      MI->getOpcode() != ARM::t2ADDrSPi12 &&
      MI->getOpcode() != ARM::tADDspi &&
      MI->getOpcode() != ARM::ADDri)
    return false;

  if (Bytes <= 0 || (Limit && Bytes >= Limit))
    // Make sure the offset fits in 8 bits.
    return false;

  // Thumb1 SP-relative immediates are in words, not bytes.
  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}
390 return false; 391 392 unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME 393 return (MI->getOperand(0).getReg() == Base && 394 MI->getOperand(1).getReg() == Base && 395 (MI->getOperand(2).getImm()*Scale) == Bytes && 396 llvm::getInstrPredicate(MI, MyPredReg) == Pred && 397 MyPredReg == PredReg); 398} 399 400static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { 401 switch (MI->getOpcode()) { 402 default: return 0; 403 case ARM::LDR: 404 case ARM::STR: 405 case ARM::t2LDRi8: 406 case ARM::t2LDRi12: 407 case ARM::t2STRi8: 408 case ARM::t2STRi12: 409 case ARM::VLDRS: 410 case ARM::VSTRS: 411 return 4; 412 case ARM::VLDRD: 413 case ARM::VSTRD: 414 return 8; 415 case ARM::LDM: 416 case ARM::STM: 417 case ARM::t2LDM: 418 case ARM::t2STM: 419 return (MI->getNumOperands() - 5) * 4; 420 case ARM::VLDMS: 421 case ARM::VSTMS: 422 case ARM::VLDMD: 423 case ARM::VSTMD: 424 return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4; 425 } 426} 427 428/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base 429/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: 430/// 431/// stmia rn, <ra, rb, rc> 432/// rn := rn + 4 * 3; 433/// => 434/// stmia rn!, <ra, rb, rc> 435/// 436/// rn := rn - 4 * 3; 437/// ldmia rn, <ra, rb, rc> 438/// => 439/// ldmdb rn!, <ra, rb, rc> 440bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, 441 MachineBasicBlock::iterator MBBI, 442 bool &Advance, 443 MachineBasicBlock::iterator &I) { 444 MachineInstr *MI = MBBI; 445 unsigned Base = MI->getOperand(0).getReg(); 446 unsigned Bytes = getLSMultipleTransferSize(MI); 447 unsigned PredReg = 0; 448 ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); 449 int Opcode = MI->getOpcode(); 450 bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM || 451 Opcode == ARM::STM || Opcode == ARM::t2STM; 452 453 if (isAM4) { 454 if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm())) 455 return false; 456 457 // Can't use the 
updating AM4 sub-mode if the base register is also a dest 458 // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. 459 for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) { 460 if (MI->getOperand(i).getReg() == Base) 461 return false; 462 } 463 464 ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); 465 if (MBBI != MBB.begin()) { 466 MachineBasicBlock::iterator PrevMBBI = prior(MBBI); 467 if (Mode == ARM_AM::ia && 468 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { 469 MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true)); 470 MI->getOperand(4).setReg(Base); 471 MI->getOperand(4).setIsDef(); 472 MBB.erase(PrevMBBI); 473 return true; 474 } else if (Mode == ARM_AM::ib && 475 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { 476 MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true)); 477 MI->getOperand(4).setReg(Base); // WB to base 478 MI->getOperand(4).setIsDef(); 479 MBB.erase(PrevMBBI); 480 return true; 481 } 482 } 483 484 if (MBBI != MBB.end()) { 485 MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); 486 if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && 487 isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { 488 MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true)); 489 MI->getOperand(4).setReg(Base); // WB to base 490 MI->getOperand(4).setIsDef(); 491 if (NextMBBI == I) { 492 Advance = true; 493 ++I; 494 } 495 MBB.erase(NextMBBI); 496 return true; 497 } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && 498 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { 499 MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true)); 500 MI->getOperand(4).setReg(Base); // WB to base 501 MI->getOperand(4).setIsDef(); 502 if (NextMBBI == I) { 503 Advance = true; 504 ++I; 505 } 506 MBB.erase(NextMBBI); 507 return true; 508 } 509 } 510 } else { 511 // VLDM{D|S}, VSTM{D|S} addressing mode 5 ops. 
512 if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm())) 513 return false; 514 515 ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm()); 516 unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm()); 517 if (MBBI != MBB.begin()) { 518 MachineBasicBlock::iterator PrevMBBI = prior(MBBI); 519 if (Mode == ARM_AM::ia && 520 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { 521 MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset)); 522 MI->getOperand(4).setReg(Base); // WB to base 523 MI->getOperand(4).setIsDef(); 524 MBB.erase(PrevMBBI); 525 return true; 526 } 527 } 528 529 if (MBBI != MBB.end()) { 530 MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); 531 if (Mode == ARM_AM::ia && 532 isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { 533 MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset)); 534 MI->getOperand(4).setReg(Base); // WB to base 535 MI->getOperand(4).setIsDef(); 536 if (NextMBBI == I) { 537 Advance = true; 538 ++I; 539 } 540 MBB.erase(NextMBBI); 541 } 542 return true; 543 } 544 } 545 546 return false; 547} 548 549static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) { 550 switch (Opc) { 551 case ARM::LDR: return ARM::LDR_PRE; 552 case ARM::STR: return ARM::STR_PRE; 553 case ARM::VLDRS: return ARM::VLDMS; 554 case ARM::VLDRD: return ARM::VLDMD; 555 case ARM::VSTRS: return ARM::VSTMS; 556 case ARM::VSTRD: return ARM::VSTMD; 557 case ARM::t2LDRi8: 558 case ARM::t2LDRi12: 559 return ARM::t2LDR_PRE; 560 case ARM::t2STRi8: 561 case ARM::t2STRi12: 562 return ARM::t2STR_PRE; 563 default: llvm_unreachable("Unhandled opcode!"); 564 } 565 return 0; 566} 567 568static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) { 569 switch (Opc) { 570 case ARM::LDR: return ARM::LDR_POST; 571 case ARM::STR: return ARM::STR_POST; 572 case ARM::VLDRS: return ARM::VLDMS; 573 case ARM::VLDRD: return ARM::VLDMD; 574 case ARM::VSTRS: return ARM::VSTMS; 575 case ARM::VSTRD: return 
/// getPostIndexedLoadStoreOpcode - Map a load / store opcode to its
/// post-indexed (base-writeback-after) form.
static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_POST;
  case ARM::STR: return ARM::STR_POST;
  case ARM::VLDRS: return ARM::VLDMS;
  case ARM::VLDRD: return ARM::VLDMD;
  case ARM::VSTRS: return ARM::VSTMS;
  case ARM::VSTRD: return ARM::VSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
/// a preceding add/sub becomes a pre-indexed form, a following one becomes
/// a post-indexed form.  Returns true if the merge was performed.
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
    Opcode == ARM::VSTRD || Opcode == ARM::VSTRS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  // Only zero-offset forms can absorb a base update.
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  // First look for a preceding add/sub of the base -> pre-indexed form.
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Otherwise look for a following add/sub of the base -> post-indexed form.
  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      // Keep the caller's scan iterator valid across the erase.
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  bool isDPR = NewOpc == ARM::VLDMD || NewOpc == ARM::VSTMD;
  unsigned Offset = 0;
  // Encode the writeback amount per addressing mode.
  if (isAM5)
    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
                               ? ARM_AM::db
                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
  if (isLd) {
    if (isAM5)
      // VLDMS, VLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST,
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // VSTMS, VSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  MBB.erase(MBBI);

  return true;
}
716static bool isMemoryOp(const MachineInstr *MI) { 717 int Opcode = MI->getOpcode(); 718 switch (Opcode) { 719 default: break; 720 case ARM::LDR: 721 case ARM::STR: 722 return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0; 723 case ARM::VLDRS: 724 case ARM::VSTRS: 725 return MI->getOperand(1).isReg(); 726 case ARM::VLDRD: 727 case ARM::VSTRD: 728 return MI->getOperand(1).isReg(); 729 case ARM::t2LDRi8: 730 case ARM::t2LDRi12: 731 case ARM::t2STRi8: 732 case ARM::t2STRi12: 733 return MI->getOperand(1).isReg(); 734 } 735 return false; 736} 737 738/// AdvanceRS - Advance register scavenger to just before the earliest memory 739/// op that is being merged. 740void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { 741 MachineBasicBlock::iterator Loc = MemOps[0].MBBI; 742 unsigned Position = MemOps[0].Position; 743 for (unsigned i = 1, e = MemOps.size(); i != e; ++i) { 744 if (MemOps[i].Position < Position) { 745 Position = MemOps[i].Position; 746 Loc = MemOps[i].MBBI; 747 } 748 } 749 750 if (Loc != MBB.begin()) 751 RS->forward(prior(Loc)); 752} 753 754static int getMemoryOpOffset(const MachineInstr *MI) { 755 int Opcode = MI->getOpcode(); 756 bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR; 757 bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; 758 unsigned NumOperands = MI->getDesc().getNumOperands(); 759 unsigned OffField = MI->getOperand(NumOperands-3).getImm(); 760 761 if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || 762 Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || 763 Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) 764 return OffField; 765 766 int Offset = isAM2 767 ? ARM_AM::getAM2Offset(OffField) 768 : (isAM3 ? 
ARM_AM::getAM3Offset(OffField) 769 : ARM_AM::getAM5Offset(OffField) * 4); 770 if (isAM2) { 771 if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub) 772 Offset = -Offset; 773 } else if (isAM3) { 774 if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub) 775 Offset = -Offset; 776 } else { 777 if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub) 778 Offset = -Offset; 779 } 780 return Offset; 781} 782 783static void InsertLDR_STR(MachineBasicBlock &MBB, 784 MachineBasicBlock::iterator &MBBI, 785 int OffImm, bool isDef, 786 DebugLoc dl, unsigned NewOpc, 787 unsigned Reg, bool RegDeadKill, bool RegUndef, 788 unsigned BaseReg, bool BaseKill, bool BaseUndef, 789 unsigned OffReg, bool OffKill, bool OffUndef, 790 ARMCC::CondCodes Pred, unsigned PredReg, 791 const TargetInstrInfo *TII, bool isT2) { 792 int Offset = OffImm; 793 if (!isT2) { 794 if (OffImm < 0) 795 Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift); 796 else 797 Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift); 798 } 799 if (isDef) { 800 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), 801 TII->get(NewOpc)) 802 .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) 803 .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); 804 if (!isT2) 805 MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef)); 806 MIB.addImm(Offset).addImm(Pred).addReg(PredReg); 807 } else { 808 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(), 809 TII->get(NewOpc)) 810 .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef)) 811 .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef)); 812 if (!isT2) 813 MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef)); 814 MIB.addImm(Offset).addImm(Pred).addReg(PredReg); 815 } 816} 817 818bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, 819 MachineBasicBlock::iterator &MBBI) { 820 MachineInstr *MI = &*MBBI; 821 unsigned Opcode = MI->getOpcode(); 
/// FixInvalidRegPairOp - Rewrite an LDRD/STRD whose register pair is not a
/// valid even/odd pair: into an LDM/STM when the registers ascend with no
/// offset, otherwise into two single loads / stores.  MBBI is repositioned
/// to the last inserted instruction.  Always returns false (the caller
/// re-examines the new instructions on the next iteration).
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
    // A proper even register followed by its odd partner is already legal.
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    // For loads the flag of interest is dead; for stores it is kill.
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    // Thumb2 LDRD/STRD has no offset register operand.
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and base register is killed, it may have been
      // re-defed by the load, make sure the first load does not clobber it.
      // In that case emit the odd-register (higher-address) load first.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    // Point MBBI at the last instruction we inserted, then drop the
    // original LDRD/STRD.
    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  return false;
}

/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  // State of the chain of mergeable memory ops currently being collected.
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    // ldrd / strd with an invalid (for ldm/stm purposes) register pair is
    // rewritten first; the iterator is updated by the callee in that case.
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance = false;
    bool TryMerge = false;
    bool Clobber = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue, keeping it sorted by offset.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block; try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's
        // needed. First advance to the instruction just before the start of
        // the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      // Reset the chain state for the next run of mergeable ops.
      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}

namespace {
  /// OffsetCompare - Order memory ops by decreasing memory offset, so after
  /// a sort the op with the lowest offset ends up at the back of the vector.
  struct OffsetCompare {
    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
      int LOffset = getMemoryOpOffset(LHS);
      int ROffset = getMemoryOpOffset(RHS);
      // Distinct instructions on the same base must have distinct offsets;
      // duplicates are filtered out before sorting.
      assert(LHS == RHS || LOffset != ROffset);
      return LOffset > ROffset;
    }
  };
}

/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
/// (bx lr) into the preceding stack restore so it directly restores the value
/// of LR into pc.
///   ldmfd sp!, {r7, lr}
///   bx lr
/// =>
///   ldmfd sp!, {r7, pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
    MachineInstr *PrevMI = prior(MBBI);
    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
      // The register being reloaded must be LR and must be the last operand
      // of the LDM's register list.
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

/// runOnMachineFunction - Run the post-RA load / store optimizations on every
/// basic block of the function: LDR/STR -> LDM/STM merging, then return
/// folding.
bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  // The scavenger lives only for the duration of this run.
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}


/// ARMPreAllocLoadStoreOpt - Pre-register-allocation pass that moves
/// loads / stores from consecutive locations closer together to make it more
/// likely they will be combined later.

namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}

    // Cached per-function state, initialized in runOnMachineFunction.
    const TargetData *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD  = Fn.getTarget().getTargetData();
  TII = Fn.getTarget().getInstrInfo();
  TRI = Fn.getTarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF  = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}

/// IsSafeAndProfitableToMove - Return true if the loads / stores in MemOps
/// (all using base register Base) can be moved next to each other across the
/// instructions in (I, E). It is unsafe if any intervening instruction is a
/// call / terminator / has unmodeled side effects, may alias (a store when
/// moving loads, any load or store when moving stores), or redefines Base.
/// It is unprofitable if the move would increase register pressure too much.
static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias
  // information some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (MemOps.count(&*I))
      continue;
    const TargetInstrDesc &TID = I->getDesc();
    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
      return false;
    if (isLd && TID.mayStore())
      return false;
    if (!isLd) {
      if (TID.mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (TID.mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // Ok if we are moving small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}

/// CanFormLdStDWord - Return true if Op0 / Op1 (two adjacent-offset loads or
/// stores off the same base) can be combined into a single LDRD / STRD (or
/// the Thumb2 equivalent). On success fills in the new opcode, operand
/// registers, encoded offset, predicate and debug location for the combined
/// instruction.
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the offset registers match.
  if (!isT2 &&
      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
    return false;

  // Make sure the base address satisfies i64 ld / st alignment requirement.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 need 8-byte align
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    Offset = OffImm;
  } else {
    // ARM mode: encode sign and magnitude in addrmode3 form.
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  // LDRD/STRD need two distinct transfer registers.
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  if (!isT2)
    OffReg = Op0->getOperand(2).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}

/// RescheduleOps - Given loads or stores (Ops) off the same base register,
/// find runs of same-opcode, same-size, consecutive-offset ops and move them
/// next to each other (loads up to the first, stores down to the last) so a
/// later pass can merge them. Pairs that qualify are combined into
/// LDRD / STRD here directly, with allocation hints to form register pairs.
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    // Walk from lowest offset (back of Ops) upwards, growing the candidate
    // run while opcode, access size and offset contiguity hold.
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register
        // pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}

/// RescheduleLoadStoreInstrs - Scan the block in regions delimited by calls
/// and terminators, bucket unpredicated loads and stores by base register,
/// then try to reschedule each bucket via RescheduleOps.
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      MI2LocMap[MI] = Loc++;
      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack: undo the Loc assignment so this instruction starts the
        // next region.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}


/// createARMLoadStoreOptimizationPass - returns an instance of the load /
/// store optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}