ARMLoadStoreOptimizer.cpp revision c76909abfec876c6b751d693ebd3df07df686aa0
1//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file contains a pass that performs load / store related peephole 11// optimizations. This pass should be run after register allocation. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "arm-ldst-opt" 16#include "ARM.h" 17#include "ARMAddressingModes.h" 18#include "ARMBaseInstrInfo.h" 19#include "ARMMachineFunctionInfo.h" 20#include "ARMRegisterInfo.h" 21#include "llvm/DerivedTypes.h" 22#include "llvm/Function.h" 23#include "llvm/CodeGen/MachineBasicBlock.h" 24#include "llvm/CodeGen/MachineFunctionPass.h" 25#include "llvm/CodeGen/MachineInstr.h" 26#include "llvm/CodeGen/MachineInstrBuilder.h" 27#include "llvm/CodeGen/MachineRegisterInfo.h" 28#include "llvm/CodeGen/RegisterScavenging.h" 29#include "llvm/Target/TargetData.h" 30#include "llvm/Target/TargetInstrInfo.h" 31#include "llvm/Target/TargetMachine.h" 32#include "llvm/Target/TargetRegisterInfo.h" 33#include "llvm/Support/Compiler.h" 34#include "llvm/Support/ErrorHandling.h" 35#include "llvm/ADT/DenseMap.h" 36#include "llvm/ADT/STLExtras.h" 37#include "llvm/ADT/SmallPtrSet.h" 38#include "llvm/ADT/SmallSet.h" 39#include "llvm/ADT/SmallVector.h" 40#include "llvm/ADT/Statistic.h" 41using namespace llvm; 42 43STATISTIC(NumLDMGened , "Number of ldm instructions generated"); 44STATISTIC(NumSTMGened , "Number of stm instructions generated"); 45STATISTIC(NumFLDMGened, "Number of fldm instructions generated"); 46STATISTIC(NumFSTMGened, "Number of fstm instructions generated"); 47STATISTIC(NumLdStMoved, "Number of load / store instructions moved"); 48STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation"); 49STATISTIC(NumSTRDFormed,"Number of strd created before allocation"); 50STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm"); 51STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm"); 52STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's"); 53STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's"); 54 55/// ARMAllocLoadStoreOpt - Post- register allocation pass the combine 56/// load / store instructions to form ldm / stm instructions. 57 58namespace { 59 struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass { 60 static char ID; 61 ARMLoadStoreOpt() : MachineFunctionPass(&ID) {} 62 63 const TargetInstrInfo *TII; 64 const TargetRegisterInfo *TRI; 65 ARMFunctionInfo *AFI; 66 RegScavenger *RS; 67 bool isThumb2; 68 69 virtual bool runOnMachineFunction(MachineFunction &Fn); 70 71 virtual const char *getPassName() const { 72 return "ARM load / store optimization pass"; 73 } 74 75 private: 76 struct MemOpQueueEntry { 77 int Offset; 78 unsigned Position; 79 MachineBasicBlock::iterator MBBI; 80 bool Merged; 81 MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i) 82 : Offset(o), Position(p), MBBI(i), Merged(false) {}; 83 }; 84 typedef SmallVector<MemOpQueueEntry,8> MemOpQueue; 85 typedef MemOpQueue::iterator MemOpQueueIter; 86 87 bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, 88 int Offset, unsigned Base, bool BaseKill, int Opcode, 89 ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, 90 DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs); 91 void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base, 92 int Opcode, unsigned Size, 93 ARMCC::CondCodes Pred, unsigned PredReg, 94 unsigned Scratch, MemOpQueue &MemOps, 95 SmallVector<MachineBasicBlock::iterator, 4> &Merges); 96 97 void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps); 98 bool FixInvalidRegPairOp(MachineBasicBlock &MBB, 99 MachineBasicBlock::iterator &MBBI); 100 bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, 101 MachineBasicBlock::iterator MBBI, 102 const TargetInstrInfo *TII, 103 bool &Advance, 104 MachineBasicBlock::iterator &I); 105 bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, 106 MachineBasicBlock::iterator MBBI, 107 bool &Advance, 108 MachineBasicBlock::iterator &I); 109 bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); 110 bool MergeReturnIntoLDM(MachineBasicBlock &MBB); 111 }; 112 char ARMLoadStoreOpt::ID = 0; 113} 114 115static int getLoadStoreMultipleOpcode(int Opcode) { 116 switch (Opcode) { 117 case ARM::LDR: 118 NumLDMGened++; 119 return ARM::LDM; 120 case ARM::STR: 121 NumSTMGened++; 122 return ARM::STM; 123 case ARM::t2LDRi8: 124 case ARM::t2LDRi12: 125 NumLDMGened++; 126 return ARM::t2LDM; 127 case ARM::t2STRi8: 128 case ARM::t2STRi12: 129 NumSTMGened++; 130 return ARM::t2STM; 131 case ARM::FLDS: 132 NumFLDMGened++; 133 return ARM::FLDMS; 134 case ARM::FSTS: 135 NumFSTMGened++; 136 return ARM::FSTMS; 137 case ARM::FLDD: 138 NumFLDMGened++; 139 return ARM::FLDMD; 140 case ARM::FSTD: 141 NumFSTMGened++; 142 return ARM::FSTMD; 143 default: llvm_unreachable("Unhandled opcode!"); 144 } 145 return 0; 146} 147 148static bool isT2i32Load(unsigned Opc) { 149 return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8; 150} 151 152static bool isi32Load(unsigned Opc) { 153 return Opc == ARM::LDR || isT2i32Load(Opc); 154} 155 156static bool isT2i32Store(unsigned Opc) { 157 return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8; 158} 159 160static bool isi32Store(unsigned Opc) { 161 return Opc == ARM::STR || isT2i32Store(Opc); 162} 163 164/// MergeOps - Create and insert a LDM or STM with Base as base register and 165/// registers in Regs as the register operands that would be loaded / stored. 166/// It returns true if the transformation is done. 167bool 168ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, 169 MachineBasicBlock::iterator MBBI, 170 int Offset, unsigned Base, bool BaseKill, 171 int Opcode, ARMCC::CondCodes Pred, 172 unsigned PredReg, unsigned Scratch, DebugLoc dl, 173 SmallVector<std::pair<unsigned, bool>, 8> &Regs) { 174 // Only a single register to load / store. Don't bother. 175 unsigned NumRegs = Regs.size(); 176 if (NumRegs <= 1) 177 return false; 178 179 ARM_AM::AMSubMode Mode = ARM_AM::ia; 180 bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode); 181 if (isAM4 && Offset == 4) { 182 if (isThumb2) 183 // Thumb2 does not support ldmib / stmib. 184 return false; 185 Mode = ARM_AM::ib; 186 } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) { 187 if (isThumb2) 188 // Thumb2 does not support ldmda / stmda. 189 return false; 190 Mode = ARM_AM::da; 191 } else if (isAM4 && Offset == -4 * (int)NumRegs) { 192 Mode = ARM_AM::db; 193 } else if (Offset != 0) { 194 // If starting offset isn't zero, insert a MI to materialize a new base. 195 // But only do so if it is cost effective, i.e. merging more than two 196 // loads / stores. 197 if (NumRegs <= 2) 198 return false; 199 200 unsigned NewBase; 201 if (isi32Load(Opcode)) 202 // If it is a load, then just use one of the destination register to 203 // use as the new base. 204 NewBase = Regs[NumRegs-1].first; 205 else { 206 // Use the scratch register to use as a new base. 207 NewBase = Scratch; 208 if (NewBase == 0) 209 return false; 210 } 211 int BaseOpc = !isThumb2 212 ? ARM::ADDri 213 : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri); 214 if (Offset < 0) { 215 BaseOpc = !isThumb2 216 ? ARM::SUBri 217 : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri); 218 Offset = - Offset; 219 } 220 int ImmedOffset = isThumb2 221 ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset); 222 if (ImmedOffset == -1) 223 // FIXME: Try t2ADDri12 or t2SUBri12? 224 return false; // Probably not worth it then. 225 226 BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase) 227 .addReg(Base, getKillRegState(BaseKill)).addImm(Offset) 228 .addImm(Pred).addReg(PredReg).addReg(0); 229 Base = NewBase; 230 BaseKill = true; // New base is always killed right its use. 231 } 232 233 bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD; 234 bool isDef = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD; 235 Opcode = getLoadStoreMultipleOpcode(Opcode); 236 MachineInstrBuilder MIB = (isAM4) 237 ? BuildMI(MBB, MBBI, dl, TII->get(Opcode)) 238 .addReg(Base, getKillRegState(BaseKill)) 239 .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg) 240 : BuildMI(MBB, MBBI, dl, TII->get(Opcode)) 241 .addReg(Base, getKillRegState(BaseKill)) 242 .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs)) 243 .addImm(Pred).addReg(PredReg); 244 for (unsigned i = 0; i != NumRegs; ++i) 245 MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef) 246 | getKillRegState(Regs[i].second)); 247 248 return true; 249} 250 251/// MergeLDR_STR - Merge a number of load / store instructions into one or more 252/// load / store multiple instructions. 253void 254ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, 255 unsigned Base, int Opcode, unsigned Size, 256 ARMCC::CondCodes Pred, unsigned PredReg, 257 unsigned Scratch, MemOpQueue &MemOps, 258 SmallVector<MachineBasicBlock::iterator, 4> &Merges) { 259 bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode); 260 int Offset = MemOps[SIndex].Offset; 261 int SOffset = Offset; 262 unsigned Pos = MemOps[SIndex].Position; 263 MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI; 264 DebugLoc dl = Loc->getDebugLoc(); 265 unsigned PReg = Loc->getOperand(0).getReg(); 266 unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg); 267 bool isKill = Loc->getOperand(0).isKill(); 268 269 SmallVector<std::pair<unsigned,bool>, 8> Regs; 270 Regs.push_back(std::make_pair(PReg, isKill)); 271 for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { 272 int NewOffset = MemOps[i].Offset; 273 unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg(); 274 unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); 275 isKill = MemOps[i].MBBI->getOperand(0).isKill(); 276 // AM4 - register numbers in ascending order. 277 // AM5 - consecutive register numbers in ascending order. 278 if (NewOffset == Offset + (int)Size && 279 ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) { 280 Offset += Size; 281 Regs.push_back(std::make_pair(Reg, isKill)); 282 PRegNum = RegNum; 283 } else { 284 // Can't merge this in. Try merge the earlier ones first. 285 if (MergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg, 286 Scratch, dl, Regs)) { 287 Merges.push_back(prior(Loc)); 288 for (unsigned j = SIndex; j < i; ++j) { 289 MBB.erase(MemOps[j].MBBI); 290 MemOps[j].Merged = true; 291 } 292 } 293 MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch, 294 MemOps, Merges); 295 return; 296 } 297 298 if (MemOps[i].Position > Pos) { 299 Pos = MemOps[i].Position; 300 Loc = MemOps[i].MBBI; 301 } 302 } 303 304 bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1; 305 if (MergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg, 306 Scratch, dl, Regs)) { 307 Merges.push_back(prior(Loc)); 308 for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) { 309 MBB.erase(MemOps[i].MBBI); 310 MemOps[i].Merged = true; 311 } 312 } 313 314 return; 315} 316 317static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base, 318 unsigned Bytes, unsigned Limit, 319 ARMCC::CondCodes Pred, unsigned PredReg){ 320 unsigned MyPredReg = 0; 321 if (!MI) 322 return false; 323 if (MI->getOpcode() != ARM::t2SUBri && 324 MI->getOpcode() != ARM::t2SUBrSPi && 325 MI->getOpcode() != ARM::t2SUBrSPi12 && 326 MI->getOpcode() != ARM::tSUBspi && 327 MI->getOpcode() != ARM::SUBri) 328 return false; 329 330 // Make sure the offset fits in 8 bits. 331 if (Bytes <= 0 || (Limit && Bytes >= Limit)) 332 return false; 333 334 unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME 335 return (MI->getOperand(0).getReg() == Base && 336 MI->getOperand(1).getReg() == Base && 337 (MI->getOperand(2).getImm()*Scale) == Bytes && 338 llvm::getInstrPredicate(MI, MyPredReg) == Pred && 339 MyPredReg == PredReg); 340} 341 342static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base, 343 unsigned Bytes, unsigned Limit, 344 ARMCC::CondCodes Pred, unsigned PredReg){ 345 unsigned MyPredReg = 0; 346 if (!MI) 347 return false; 348 if (MI->getOpcode() != ARM::t2ADDri && 349 MI->getOpcode() != ARM::t2ADDrSPi && 350 MI->getOpcode() != ARM::t2ADDrSPi12 && 351 MI->getOpcode() != ARM::tADDspi && 352 MI->getOpcode() != ARM::ADDri) 353 return false; 354 355 if (Bytes <= 0 || (Limit && Bytes >= Limit)) 356 // Make sure the offset fits in 8 bits. 357 return false; 358 359 unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME 360 return (MI->getOperand(0).getReg() == Base && 361 MI->getOperand(1).getReg() == Base && 362 (MI->getOperand(2).getImm()*Scale) == Bytes && 363 llvm::getInstrPredicate(MI, MyPredReg) == Pred && 364 MyPredReg == PredReg); 365} 366 367static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { 368 switch (MI->getOpcode()) { 369 default: return 0; 370 case ARM::LDR: 371 case ARM::STR: 372 case ARM::t2LDRi8: 373 case ARM::t2LDRi12: 374 case ARM::t2STRi8: 375 case ARM::t2STRi12: 376 case ARM::FLDS: 377 case ARM::FSTS: 378 return 4; 379 case ARM::FLDD: 380 case ARM::FSTD: 381 return 8; 382 case ARM::LDM: 383 case ARM::STM: 384 case ARM::t2LDM: 385 case ARM::t2STM: 386 return (MI->getNumOperands() - 4) * 4; 387 case ARM::FLDMS: 388 case ARM::FSTMS: 389 case ARM::FLDMD: 390 case ARM::FSTMD: 391 return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4; 392 } 393} 394 395/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base 396/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible: 397/// 398/// stmia rn, <ra, rb, rc> 399/// rn := rn + 4 * 3; 400/// => 401/// stmia rn!, <ra, rb, rc> 402/// 403/// rn := rn - 4 * 3; 404/// ldmia rn, <ra, rb, rc> 405/// => 406/// ldmdb rn!, <ra, rb, rc> 407bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, 408 MachineBasicBlock::iterator MBBI, 409 bool &Advance, 410 MachineBasicBlock::iterator &I) { 411 MachineInstr *MI = MBBI; 412 unsigned Base = MI->getOperand(0).getReg(); 413 unsigned Bytes = getLSMultipleTransferSize(MI); 414 unsigned PredReg = 0; 415 ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); 416 int Opcode = MI->getOpcode(); 417 bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM || 418 Opcode == ARM::STM || Opcode == ARM::t2STM; 419 420 if (isAM4) { 421 if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm())) 422 return false; 423 424 // Can't use the updating AM4 sub-mode if the base register is also a dest 425 // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. 426 for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) { 427 if (MI->getOperand(i).getReg() == Base) 428 return false; 429 } 430 431 ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm()); 432 if (MBBI != MBB.begin()) { 433 MachineBasicBlock::iterator PrevMBBI = prior(MBBI); 434 if (Mode == ARM_AM::ia && 435 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { 436 MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true)); 437 MBB.erase(PrevMBBI); 438 return true; 439 } else if (Mode == ARM_AM::ib && 440 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { 441 MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true)); 442 MBB.erase(PrevMBBI); 443 return true; 444 } 445 } 446 447 if (MBBI != MBB.end()) { 448 MachineBasicBlock::iterator NextMBBI = next(MBBI); 449 if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && 450 isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { 451 MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true)); 452 if (NextMBBI == I) { 453 Advance = true; 454 ++I; 455 } 456 MBB.erase(NextMBBI); 457 return true; 458 } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && 459 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { 460 MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true)); 461 if (NextMBBI == I) { 462 Advance = true; 463 ++I; 464 } 465 MBB.erase(NextMBBI); 466 return true; 467 } 468 } 469 } else { 470 // FLDM{D|S}, FSTM{D|S} addressing mode 5 ops. 471 if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm())) 472 return false; 473 474 ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm()); 475 unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm()); 476 if (MBBI != MBB.begin()) { 477 MachineBasicBlock::iterator PrevMBBI = prior(MBBI); 478 if (Mode == ARM_AM::ia && 479 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { 480 MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset)); 481 MBB.erase(PrevMBBI); 482 return true; 483 } 484 } 485 486 if (MBBI != MBB.end()) { 487 MachineBasicBlock::iterator NextMBBI = next(MBBI); 488 if (Mode == ARM_AM::ia && 489 isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { 490 MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset)); 491 if (NextMBBI == I) { 492 Advance = true; 493 ++I; 494 } 495 MBB.erase(NextMBBI); 496 } 497 return true; 498 } 499 } 500 501 return false; 502} 503 504static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) { 505 switch (Opc) { 506 case ARM::LDR: return ARM::LDR_PRE; 507 case ARM::STR: return ARM::STR_PRE; 508 case ARM::FLDS: return ARM::FLDMS; 509 case ARM::FLDD: return ARM::FLDMD; 510 case ARM::FSTS: return ARM::FSTMS; 511 case ARM::FSTD: return ARM::FSTMD; 512 case ARM::t2LDRi8: 513 case ARM::t2LDRi12: 514 return ARM::t2LDR_PRE; 515 case ARM::t2STRi8: 516 case ARM::t2STRi12: 517 return ARM::t2STR_PRE; 518 default: llvm_unreachable("Unhandled opcode!"); 519 } 520 return 0; 521} 522 523static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) { 524 switch (Opc) { 525 case ARM::LDR: return ARM::LDR_POST; 526 case ARM::STR: return ARM::STR_POST; 527 case ARM::FLDS: return ARM::FLDMS; 528 case ARM::FLDD: return ARM::FLDMD; 529 case ARM::FSTS: return ARM::FSTMS; 530 case ARM::FSTD: return ARM::FSTMD; 531 case ARM::t2LDRi8: 532 case ARM::t2LDRi12: 533 return ARM::t2LDR_POST; 534 case ARM::t2STRi8: 535 case ARM::t2STRi12: 536 return ARM::t2STR_POST; 537 default: llvm_unreachable("Unhandled opcode!"); 538 } 539 return 0; 540} 541 542/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base 543/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible: 544bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, 545 MachineBasicBlock::iterator MBBI, 546 const TargetInstrInfo *TII, 547 bool &Advance, 548 MachineBasicBlock::iterator &I) { 549 MachineInstr *MI = MBBI; 550 unsigned Base = MI->getOperand(1).getReg(); 551 bool BaseKill = MI->getOperand(1).isKill(); 552 unsigned Bytes = getLSMultipleTransferSize(MI); 553 int Opcode = MI->getOpcode(); 554 DebugLoc dl = MI->getDebugLoc(); 555 bool isAM5 = Opcode == ARM::FLDD || Opcode == ARM::FLDS || 556 Opcode == ARM::FSTD || Opcode == ARM::FSTS; 557 bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR; 558 if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) 559 return false; 560 else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) 561 return false; 562 else if (isT2i32Load(Opcode) || isT2i32Store(Opcode)) 563 if (MI->getOperand(2).getImm() != 0) 564 return false; 565 566 bool isLd = isi32Load(Opcode) || Opcode == ARM::FLDS || Opcode == ARM::FLDD; 567 // Can't do the merge if the destination register is the same as the would-be 568 // writeback register. 569 if (isLd && MI->getOperand(0).getReg() == Base) 570 return false; 571 572 unsigned PredReg = 0; 573 ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); 574 bool DoMerge = false; 575 ARM_AM::AddrOpc AddSub = ARM_AM::add; 576 unsigned NewOpc = 0; 577 // AM2 - 12 bits, thumb2 - 8 bits. 578 unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); 579 if (MBBI != MBB.begin()) { 580 MachineBasicBlock::iterator PrevMBBI = prior(MBBI); 581 if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { 582 DoMerge = true; 583 AddSub = ARM_AM::sub; 584 NewOpc = getPreIndexedLoadStoreOpcode(Opcode); 585 } else if (!isAM5 && 586 isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { 587 DoMerge = true; 588 NewOpc = getPreIndexedLoadStoreOpcode(Opcode); 589 } 590 if (DoMerge) 591 MBB.erase(PrevMBBI); 592 } 593 594 if (!DoMerge && MBBI != MBB.end()) { 595 MachineBasicBlock::iterator NextMBBI = next(MBBI); 596 if (!isAM5 && 597 isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { 598 DoMerge = true; 599 AddSub = ARM_AM::sub; 600 NewOpc = getPostIndexedLoadStoreOpcode(Opcode); 601 } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { 602 DoMerge = true; 603 NewOpc = getPostIndexedLoadStoreOpcode(Opcode); 604 } 605 if (DoMerge) { 606 if (NextMBBI == I) { 607 Advance = true; 608 ++I; 609 } 610 MBB.erase(NextMBBI); 611 } 612 } 613 614 if (!DoMerge) 615 return false; 616 617 bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD; 618 unsigned Offset = 0; 619 if (isAM5) 620 Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) 621 ? ARM_AM::db 622 : ARM_AM::ia, true, (isDPR ? 2 : 1)); 623 else if (isAM2) 624 Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); 625 else 626 Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; 627 if (isLd) { 628 if (isAM5) 629 // FLDMS, FLDMD 630 BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) 631 .addReg(Base, getKillRegState(BaseKill)) 632 .addImm(Offset).addImm(Pred).addReg(PredReg) 633 .addReg(MI->getOperand(0).getReg(), RegState::Define); 634 else if (isAM2) 635 // LDR_PRE, LDR_POST, 636 BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) 637 .addReg(Base, RegState::Define) 638 .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); 639 else 640 // t2LDR_PRE, t2LDR_POST 641 BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) 642 .addReg(Base, RegState::Define) 643 .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); 644 } else { 645 MachineOperand &MO = MI->getOperand(0); 646 if (isAM5) 647 // FSTMS, FSTMD 648 BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset) 649 .addImm(Pred).addReg(PredReg) 650 .addReg(MO.getReg(), getKillRegState(MO.isKill())); 651 else if (isAM2) 652 // STR_PRE, STR_POST 653 BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) 654 .addReg(MO.getReg(), getKillRegState(MO.isKill())) 655 .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); 656 else 657 // t2STR_PRE, t2STR_POST 658 BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) 659 .addReg(MO.getReg(), getKillRegState(MO.isKill())) 660 .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); 661 } 662 MBB.erase(MBBI); 663 664 return true; 665} 666 667/// isMemoryOp - Returns true if instruction is a memory operations (that this 668/// pass is capable of operating on). 669static bool isMemoryOp(const MachineInstr *MI) { 670 int Opcode = MI->getOpcode(); 671 switch (Opcode) { 672 default: break; 673 case ARM::LDR: 674 case ARM::STR: 675 return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0; 676 case ARM::FLDS: 677 case ARM::FSTS: 678 return MI->getOperand(1).isReg(); 679 case ARM::FLDD: 680 case ARM::FSTD: 681 return MI->getOperand(1).isReg(); 682 case ARM::t2LDRi8: 683 case ARM::t2LDRi12: 684 case ARM::t2STRi8: 685 case ARM::t2STRi12: 686 return true; 687 } 688 return false; 689} 690 691/// AdvanceRS - Advance register scavenger to just before the earliest memory 692/// op that is being merged. 693void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { 694 MachineBasicBlock::iterator Loc = MemOps[0].MBBI; 695 unsigned Position = MemOps[0].Position; 696 for (unsigned i = 1, e = MemOps.size(); i != e; ++i) { 697 if (MemOps[i].Position < Position) { 698 Position = MemOps[i].Position; 699 Loc = MemOps[i].MBBI; 700 } 701 } 702 703 if (Loc != MBB.begin()) 704 RS->forward(prior(Loc)); 705} 706 707static int getMemoryOpOffset(const MachineInstr *MI) { 708 int Opcode = MI->getOpcode(); 709 bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR; 710 bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; 711 unsigned NumOperands = MI->getDesc().getNumOperands(); 712 unsigned OffField = MI->getOperand(NumOperands-3).getImm(); 713 714 if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || 715 Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || 716 Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) 717 return OffField; 718 719 int Offset = isAM2 720 ? ARM_AM::getAM2Offset(OffField) 721 : (isAM3 ? ARM_AM::getAM3Offset(OffField) 722 : ARM_AM::getAM5Offset(OffField) * 4); 723 if (isAM2) { 724 if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub) 725 Offset = -Offset; 726 } else if (isAM3) { 727 if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub) 728 Offset = -Offset; 729 } else { 730 if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub) 731 Offset = -Offset; 732 } 733 return Offset; 734} 735 736static void InsertLDR_STR(MachineBasicBlock &MBB, 737 MachineBasicBlock::iterator &MBBI, 738 int OffImm, bool isDef, 739 DebugLoc dl, unsigned NewOpc, 740 unsigned Reg, bool RegDeadKill, 741 unsigned BaseReg, bool BaseKill, 742 unsigned OffReg, bool OffKill, 743 ARMCC::CondCodes Pred, unsigned PredReg, 744 const TargetInstrInfo *TII) { 745 unsigned Offset; 746 if (OffImm < 0) 747 Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift); 748 else 749 Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift); 750 if (isDef) 751 BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) 752 .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill)) 753 .addReg(BaseReg, getKillRegState(BaseKill)) 754 .addReg(OffReg, getKillRegState(OffKill)) 755 .addImm(Offset) 756 .addImm(Pred).addReg(PredReg); 757 else 758 BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) 759 .addReg(Reg, getKillRegState(RegDeadKill)) 760 .addReg(BaseReg, getKillRegState(BaseKill)) 761 .addReg(OffReg, getKillRegState(OffKill)) 762 .addImm(Offset) 763 .addImm(Pred).addReg(PredReg); 764} 765 766bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, 767 MachineBasicBlock::iterator &MBBI) { 768 MachineInstr *MI = &*MBBI; 769 unsigned Opcode = MI->getOpcode(); 770 if (Opcode == ARM::LDRD || Opcode == ARM::STRD) { 771 unsigned EvenReg = MI->getOperand(0).getReg(); 772 unsigned OddReg = MI->getOperand(1).getReg(); 773 unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); 774 unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); 775 if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum) 776 return false; 777 778 bool isLd = Opcode == ARM::LDRD; 779 bool EvenDeadKill = isLd ? 780 MI->getOperand(0).isDead() : MI->getOperand(0).isKill(); 781 bool OddDeadKill = isLd ? 782 MI->getOperand(1).isDead() : MI->getOperand(1).isKill(); 783 const MachineOperand &BaseOp = MI->getOperand(2); 784 unsigned BaseReg = BaseOp.getReg(); 785 bool BaseKill = BaseOp.isKill(); 786 const MachineOperand &OffOp = MI->getOperand(3); 787 unsigned OffReg = OffOp.getReg(); 788 bool OffKill = OffOp.isKill(); 789 int OffImm = getMemoryOpOffset(MI); 790 unsigned PredReg = 0; 791 ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); 792 793 if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) { 794 // Ascending register numbers and no offset. It's safe to change it to a 795 // ldm or stm. 796 unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDM : ARM::STM; 797 if (isLd) { 798 BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) 799 .addReg(BaseReg, getKillRegState(BaseKill)) 800 .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) 801 .addImm(Pred).addReg(PredReg) 802 .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) 803 .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); 804 ++NumLDRD2LDM; 805 } else { 806 BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) 807 .addReg(BaseReg, getKillRegState(BaseKill)) 808 .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia)) 809 .addImm(Pred).addReg(PredReg) 810 .addReg(EvenReg, getKillRegState(EvenDeadKill)) 811 .addReg(OddReg, getKillRegState(OddDeadKill)); 812 ++NumSTRD2STM; 813 } 814 } else { 815 // Split into two instructions. 816 unsigned NewOpc = (Opcode == ARM::LDRD) ? ARM::LDR : ARM::STR; 817 DebugLoc dl = MBBI->getDebugLoc(); 818 // If this is a load and base register is killed, it may have been 819 // re-defed by the load, make sure the first load does not clobber it. 820 if (isLd && 821 (BaseKill || OffKill) && 822 (TRI->regsOverlap(EvenReg, BaseReg) || 823 (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) { 824 assert(!TRI->regsOverlap(OddReg, BaseReg) && 825 (!OffReg || !TRI->regsOverlap(OddReg, OffReg))); 826 InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill, 827 BaseReg, false, OffReg, false, Pred, PredReg, TII); 828 InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, 829 BaseReg, BaseKill, OffReg, OffKill, Pred, PredReg, TII); 830 } else { 831 InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, 832 EvenReg, EvenDeadKill, BaseReg, false, OffReg, false, 833 Pred, PredReg, TII); 834 InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, 835 OddReg, OddDeadKill, BaseReg, BaseKill, OffReg, OffKill, 836 Pred, PredReg, TII); 837 } 838 if (isLd) 839 ++NumLDRD2LDR; 840 else 841 ++NumSTRD2STR; 842 } 843 844 MBBI = prior(MBBI); 845 MBB.erase(MI); 846 } 847 return false; 848} 849 850/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR 851/// ops of the same base and incrementing offset into LDM / STM ops. 852bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { 853 unsigned NumMerges = 0; 854 unsigned NumMemOps = 0; 855 MemOpQueue MemOps; 856 unsigned CurrBase = 0; 857 int CurrOpc = -1; 858 unsigned CurrSize = 0; 859 ARMCC::CondCodes CurrPred = ARMCC::AL; 860 unsigned CurrPredReg = 0; 861 unsigned Position = 0; 862 SmallVector<MachineBasicBlock::iterator,4> Merges; 863 864 RS->enterBasicBlock(&MBB); 865 MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); 866 while (MBBI != E) { 867 if (FixInvalidRegPairOp(MBB, MBBI)) 868 continue; 869 870 bool Advance = false; 871 bool TryMerge = false; 872 bool Clobber = false; 873 874 bool isMemOp = isMemoryOp(MBBI); 875 if (isMemOp) { 876 int Opcode = MBBI->getOpcode(); 877 unsigned Size = getLSMultipleTransferSize(MBBI); 878 unsigned Base = MBBI->getOperand(1).getReg(); 879 unsigned PredReg = 0; 880 ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg); 881 int Offset = getMemoryOpOffset(MBBI); 882 // Watch out for: 883 // r4 := ldr [r5] 884 // r5 := ldr [r5, #4] 885 // r6 := ldr [r5, #8] 886 // 887 // The second ldr has effectively broken the chain even though it 888 // looks like the later ldr(s) use the same base register. Try to 889 // merge the ldr's so far, including this one. But don't try to 890 // combine the following ldr(s). 891 Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg()); 892 if (CurrBase == 0 && !Clobber) { 893 // Start of a new chain. 894 CurrBase = Base; 895 CurrOpc = Opcode; 896 CurrSize = Size; 897 CurrPred = Pred; 898 CurrPredReg = PredReg; 899 MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); 900 NumMemOps++; 901 Advance = true; 902 } else { 903 if (Clobber) { 904 TryMerge = true; 905 Advance = true; 906 } 907 908 if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { 909 // No need to match PredReg. 910 // Continue adding to the queue. 911 if (Offset > MemOps.back().Offset) { 912 MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); 913 NumMemOps++; 914 Advance = true; 915 } else { 916 for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); 917 I != E; ++I) { 918 if (Offset < I->Offset) { 919 MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI)); 920 NumMemOps++; 921 Advance = true; 922 break; 923 } else if (Offset == I->Offset) { 924 // Collision! This can't be merged! 925 break; 926 } 927 } 928 } 929 } 930 } 931 } 932 933 if (Advance) { 934 ++Position; 935 ++MBBI; 936 } else 937 TryMerge = true; 938 939 if (TryMerge) { 940 if (NumMemOps > 1) { 941 // Try to find a free register to use as a new base in case it's needed. 942 // First advance to the instruction just before the start of the chain. 943 AdvanceRS(MBB, MemOps); 944 // Find a scratch register. 945 unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass); 946 // Process the load / store instructions. 947 RS->forward(prior(MBBI)); 948 949 // Merge ops. 950 Merges.clear(); 951 MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, 952 CurrPred, CurrPredReg, Scratch, MemOps, Merges); 953 954 // Try folding preceeding/trailing base inc/dec into the generated 955 // LDM/STM ops. 956 for (unsigned i = 0, e = Merges.size(); i < e; ++i) 957 if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) 958 ++NumMerges; 959 NumMerges += Merges.size(); 960 961 // Try folding preceeding/trailing base inc/dec into those load/store 962 // that were not merged to form LDM/STM ops. 963 for (unsigned i = 0; i != NumMemOps; ++i) 964 if (!MemOps[i].Merged) 965 if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI)) 966 ++NumMerges; 967 968 // RS may be pointing to an instruction that's deleted. 969 RS->skipTo(prior(MBBI)); 970 } else if (NumMemOps == 1) { 971 // Try folding preceeding/trailing base inc/dec into the single 972 // load/store. 973 if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { 974 ++NumMerges; 975 RS->forward(prior(MBBI)); 976 } 977 } 978 979 CurrBase = 0; 980 CurrOpc = -1; 981 CurrSize = 0; 982 CurrPred = ARMCC::AL; 983 CurrPredReg = 0; 984 if (NumMemOps) { 985 MemOps.clear(); 986 NumMemOps = 0; 987 } 988 989 // If iterator hasn't been advanced and this is not a memory op, skip it. 990 // It can't start a new chain anyway. 991 if (!Advance && !isMemOp && MBBI != E) { 992 ++Position; 993 ++MBBI; 994 } 995 } 996 } 997 return NumMerges > 0; 998} 999 1000namespace { 1001 struct OffsetCompare { 1002 bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const { 1003 int LOffset = getMemoryOpOffset(LHS); 1004 int ROffset = getMemoryOpOffset(RHS); 1005 assert(LHS == RHS || LOffset != ROffset); 1006 return LOffset > ROffset; 1007 } 1008 }; 1009} 1010 1011/// MergeReturnIntoLDM - If this is a exit BB, try merging the return op 1012/// (bx lr) into the preceeding stack restore so it directly restore the value 1013/// of LR into pc. 1014/// ldmfd sp!, {r7, lr} 1015/// bx lr 1016/// => 1017/// ldmfd sp!, {r7, pc} 1018bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { 1019 if (MBB.empty()) return false; 1020 1021 MachineBasicBlock::iterator MBBI = prior(MBB.end()); 1022 if (MBBI != MBB.begin() && 1023 (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) { 1024 MachineInstr *PrevMI = prior(MBBI); 1025 if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) { 1026 MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1); 1027 if (MO.getReg() != ARM::LR) 1028 return false; 1029 unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET; 1030 PrevMI->setDesc(TII->get(NewOpc)); 1031 MO.setReg(ARM::PC); 1032 MBB.erase(MBBI); 1033 return true; 1034 } 1035 } 1036 return false; 1037} 1038 1039bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { 1040 const TargetMachine &TM = Fn.getTarget(); 1041 AFI = Fn.getInfo<ARMFunctionInfo>(); 1042 TII = TM.getInstrInfo(); 1043 TRI = TM.getRegisterInfo(); 1044 RS = new RegScavenger(); 1045 isThumb2 = AFI->isThumb2Function(); 1046 1047 bool Modified = false; 1048 for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; 1049 ++MFI) { 1050 MachineBasicBlock &MBB = *MFI; 1051 Modified |= LoadStoreMultipleOpti(MBB); 1052 Modified |= MergeReturnIntoLDM(MBB); 1053 } 1054 1055 delete RS; 1056 return Modified; 1057} 1058 1059 1060/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move 1061/// load / stores from consecutive locations close to make it more 1062/// likely they will be combined later. 1063 1064namespace { 1065 struct VISIBILITY_HIDDEN ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ 1066 static char ID; 1067 ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {} 1068 1069 const TargetData *TD; 1070 const TargetInstrInfo *TII; 1071 const TargetRegisterInfo *TRI; 1072 const ARMSubtarget *STI; 1073 MachineRegisterInfo *MRI; 1074 1075 virtual bool runOnMachineFunction(MachineFunction &Fn); 1076 1077 virtual const char *getPassName() const { 1078 return "ARM pre- register allocation load / store optimization pass"; 1079 } 1080 1081 private: 1082 bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, 1083 unsigned &NewOpc, unsigned &EvenReg, 1084 unsigned &OddReg, unsigned &BaseReg, 1085 unsigned &OffReg, unsigned &Offset, 1086 unsigned &PredReg, ARMCC::CondCodes &Pred); 1087 bool RescheduleOps(MachineBasicBlock *MBB, 1088 SmallVector<MachineInstr*, 4> &Ops, 1089 unsigned Base, bool isLd, 1090 DenseMap<MachineInstr*, unsigned> &MI2LocMap); 1091 bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB); 1092 }; 1093 char ARMPreAllocLoadStoreOpt::ID = 0; 1094} 1095 1096bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { 1097 TD = Fn.getTarget().getTargetData(); 1098 TII = Fn.getTarget().getInstrInfo(); 1099 TRI = Fn.getTarget().getRegisterInfo(); 1100 STI = &Fn.getTarget().getSubtarget<ARMSubtarget>(); 1101 MRI = &Fn.getRegInfo(); 1102 1103 bool Modified = false; 1104 for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; 1105 ++MFI) 1106 Modified |= RescheduleLoadStoreInstrs(MFI); 1107 1108 return Modified; 1109} 1110 1111static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, 1112 MachineBasicBlock::iterator I, 1113 MachineBasicBlock::iterator E, 1114 SmallPtrSet<MachineInstr*, 4> &MemOps, 1115 SmallSet<unsigned, 4> &MemRegs, 1116 const TargetRegisterInfo *TRI) { 1117 // Are there stores / loads / calls between them? 1118 // FIXME: This is overly conservative. We should make use of alias information 1119 // some day. 1120 SmallSet<unsigned, 4> AddedRegPressure; 1121 while (++I != E) { 1122 if (MemOps.count(&*I)) 1123 continue; 1124 const TargetInstrDesc &TID = I->getDesc(); 1125 if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects()) 1126 return false; 1127 if (isLd && TID.mayStore()) 1128 return false; 1129 if (!isLd) { 1130 if (TID.mayLoad()) 1131 return false; 1132 // It's not safe to move the first 'str' down. 1133 // str r1, [r0] 1134 // strh r5, [r0] 1135 // str r4, [r0, #+4] 1136 if (TID.mayStore()) 1137 return false; 1138 } 1139 for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) { 1140 MachineOperand &MO = I->getOperand(j); 1141 if (!MO.isReg()) 1142 continue; 1143 unsigned Reg = MO.getReg(); 1144 if (MO.isDef() && TRI->regsOverlap(Reg, Base)) 1145 return false; 1146 if (Reg != Base && !MemRegs.count(Reg)) 1147 AddedRegPressure.insert(Reg); 1148 } 1149 } 1150 1151 // Estimate register pressure increase due to the transformation. 1152 if (MemRegs.size() <= 4) 1153 // Ok if we are moving small number of instructions. 1154 return true; 1155 return AddedRegPressure.size() <= MemRegs.size() * 2; 1156} 1157 1158bool 1159ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, 1160 DebugLoc &dl, 1161 unsigned &NewOpc, unsigned &EvenReg, 1162 unsigned &OddReg, unsigned &BaseReg, 1163 unsigned &OffReg, unsigned &Offset, 1164 unsigned &PredReg, 1165 ARMCC::CondCodes &Pred) { 1166 // FIXME: FLDS / FSTS -> FLDD / FSTD 1167 unsigned Opcode = Op0->getOpcode(); 1168 if (Opcode == ARM::LDR) 1169 NewOpc = ARM::LDRD; 1170 else if (Opcode == ARM::STR) 1171 NewOpc = ARM::STRD; 1172 else 1173 return 0; 1174 1175 // Must sure the base address satisfies i64 ld / st alignment requirement. 1176 if (!Op0->hasOneMemOperand() || 1177 !(*Op0->memoperands_begin())->getValue() || 1178 (*Op0->memoperands_begin())->isVolatile()) 1179 return false; 1180 1181 unsigned Align = (*Op0->memoperands_begin())->getAlignment(); 1182 unsigned ReqAlign = STI->hasV6Ops() 1183 ? TD->getPrefTypeAlignment( 1184 Type::getInt64Ty(Op0->getParent()->getParent()->getFunction()->getContext())) 1185 : 8; // Pre-v6 need 8-byte align 1186 if (Align < ReqAlign) 1187 return false; 1188 1189 // Then make sure the immediate offset fits. 1190 int OffImm = getMemoryOpOffset(Op0); 1191 ARM_AM::AddrOpc AddSub = ARM_AM::add; 1192 if (OffImm < 0) { 1193 AddSub = ARM_AM::sub; 1194 OffImm = - OffImm; 1195 } 1196 if (OffImm >= 256) // 8 bits 1197 return false; 1198 Offset = ARM_AM::getAM3Opc(AddSub, OffImm); 1199 1200 EvenReg = Op0->getOperand(0).getReg(); 1201 OddReg = Op1->getOperand(0).getReg(); 1202 if (EvenReg == OddReg) 1203 return false; 1204 BaseReg = Op0->getOperand(1).getReg(); 1205 OffReg = Op0->getOperand(2).getReg(); 1206 Pred = llvm::getInstrPredicate(Op0, PredReg); 1207 dl = Op0->getDebugLoc(); 1208 return true; 1209} 1210 1211bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, 1212 SmallVector<MachineInstr*, 4> &Ops, 1213 unsigned Base, bool isLd, 1214 DenseMap<MachineInstr*, unsigned> &MI2LocMap) { 1215 bool RetVal = false; 1216 1217 // Sort by offset (in reverse order). 1218 std::sort(Ops.begin(), Ops.end(), OffsetCompare()); 1219 1220 // The loads / stores of the same base are in order. Scan them from first to 1221 // last and check for the followins: 1222 // 1. Any def of base. 1223 // 2. Any gaps. 1224 while (Ops.size() > 1) { 1225 unsigned FirstLoc = ~0U; 1226 unsigned LastLoc = 0; 1227 MachineInstr *FirstOp = 0; 1228 MachineInstr *LastOp = 0; 1229 int LastOffset = 0; 1230 unsigned LastOpcode = 0; 1231 unsigned LastBytes = 0; 1232 unsigned NumMove = 0; 1233 for (int i = Ops.size() - 1; i >= 0; --i) { 1234 MachineInstr *Op = Ops[i]; 1235 unsigned Loc = MI2LocMap[Op]; 1236 if (Loc <= FirstLoc) { 1237 FirstLoc = Loc; 1238 FirstOp = Op; 1239 } 1240 if (Loc >= LastLoc) { 1241 LastLoc = Loc; 1242 LastOp = Op; 1243 } 1244 1245 unsigned Opcode = Op->getOpcode(); 1246 if (LastOpcode && Opcode != LastOpcode) 1247 break; 1248 1249 int Offset = getMemoryOpOffset(Op); 1250 unsigned Bytes = getLSMultipleTransferSize(Op); 1251 if (LastBytes) { 1252 if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes)) 1253 break; 1254 } 1255 LastOffset = Offset; 1256 LastBytes = Bytes; 1257 LastOpcode = Opcode; 1258 if (++NumMove == 8) // FIXME: Tune 1259 break; 1260 } 1261 1262 if (NumMove <= 1) 1263 Ops.pop_back(); 1264 else { 1265 SmallPtrSet<MachineInstr*, 4> MemOps; 1266 SmallSet<unsigned, 4> MemRegs; 1267 for (int i = NumMove-1; i >= 0; --i) { 1268 MemOps.insert(Ops[i]); 1269 MemRegs.insert(Ops[i]->getOperand(0).getReg()); 1270 } 1271 1272 // Be conservative, if the instructions are too far apart, don't 1273 // move them. We want to limit the increase of register pressure. 1274 bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this. 1275 if (DoMove) 1276 DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp, 1277 MemOps, MemRegs, TRI); 1278 if (!DoMove) { 1279 for (unsigned i = 0; i != NumMove; ++i) 1280 Ops.pop_back(); 1281 } else { 1282 // This is the new location for the loads / stores. 1283 MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp; 1284 while (InsertPos != MBB->end() && MemOps.count(InsertPos)) 1285 ++InsertPos; 1286 1287 // If we are moving a pair of loads / stores, see if it makes sense 1288 // to try to allocate a pair of registers that can form register pairs. 1289 MachineInstr *Op0 = Ops.back(); 1290 MachineInstr *Op1 = Ops[Ops.size()-2]; 1291 unsigned EvenReg = 0, OddReg = 0; 1292 unsigned BaseReg = 0, OffReg = 0, PredReg = 0; 1293 ARMCC::CondCodes Pred = ARMCC::AL; 1294 unsigned NewOpc = 0; 1295 unsigned Offset = 0; 1296 DebugLoc dl; 1297 if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc, 1298 EvenReg, OddReg, BaseReg, OffReg, 1299 Offset, PredReg, Pred)) { 1300 Ops.pop_back(); 1301 Ops.pop_back(); 1302 1303 // Form the pair instruction. 1304 if (isLd) { 1305 BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc)) 1306 .addReg(EvenReg, RegState::Define) 1307 .addReg(OddReg, RegState::Define) 1308 .addReg(BaseReg).addReg(0).addImm(Offset) 1309 .addImm(Pred).addReg(PredReg); 1310 ++NumLDRDFormed; 1311 } else { 1312 BuildMI(*MBB, InsertPos, dl, TII->get(NewOpc)) 1313 .addReg(EvenReg) 1314 .addReg(OddReg) 1315 .addReg(BaseReg).addReg(0).addImm(Offset) 1316 .addImm(Pred).addReg(PredReg); 1317 ++NumSTRDFormed; 1318 } 1319 MBB->erase(Op0); 1320 MBB->erase(Op1); 1321 1322 // Add register allocation hints to form register pairs. 1323 MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg); 1324 MRI->setRegAllocationHint(OddReg, ARMRI::RegPairOdd, EvenReg); 1325 } else { 1326 for (unsigned i = 0; i != NumMove; ++i) { 1327 MachineInstr *Op = Ops.back(); 1328 Ops.pop_back(); 1329 MBB->splice(InsertPos, MBB, Op); 1330 } 1331 } 1332 1333 NumLdStMoved += NumMove; 1334 RetVal = true; 1335 } 1336 } 1337 } 1338 1339 return RetVal; 1340} 1341 1342bool 1343ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { 1344 bool RetVal = false; 1345 1346 DenseMap<MachineInstr*, unsigned> MI2LocMap; 1347 DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap; 1348 DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap; 1349 SmallVector<unsigned, 4> LdBases; 1350 SmallVector<unsigned, 4> StBases; 1351 1352 unsigned Loc = 0; 1353 MachineBasicBlock::iterator MBBI = MBB->begin(); 1354 MachineBasicBlock::iterator E = MBB->end(); 1355 while (MBBI != E) { 1356 for (; MBBI != E; ++MBBI) { 1357 MachineInstr *MI = MBBI; 1358 const TargetInstrDesc &TID = MI->getDesc(); 1359 if (TID.isCall() || TID.isTerminator()) { 1360 // Stop at barriers. 1361 ++MBBI; 1362 break; 1363 } 1364 1365 MI2LocMap[MI] = Loc++; 1366 if (!isMemoryOp(MI)) 1367 continue; 1368 unsigned PredReg = 0; 1369 if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL) 1370 continue; 1371 1372 int Opcode = MI->getOpcode(); 1373 bool isLd = Opcode == ARM::LDR || 1374 Opcode == ARM::FLDS || Opcode == ARM::FLDD; 1375 unsigned Base = MI->getOperand(1).getReg(); 1376 int Offset = getMemoryOpOffset(MI); 1377 1378 bool StopHere = false; 1379 if (isLd) { 1380 DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = 1381 Base2LdsMap.find(Base); 1382 if (BI != Base2LdsMap.end()) { 1383 for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { 1384 if (Offset == getMemoryOpOffset(BI->second[i])) { 1385 StopHere = true; 1386 break; 1387 } 1388 } 1389 if (!StopHere) 1390 BI->second.push_back(MI); 1391 } else { 1392 SmallVector<MachineInstr*, 4> MIs; 1393 MIs.push_back(MI); 1394 Base2LdsMap[Base] = MIs; 1395 LdBases.push_back(Base); 1396 } 1397 } else { 1398 DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = 1399 Base2StsMap.find(Base); 1400 if (BI != Base2StsMap.end()) { 1401 for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { 1402 if (Offset == getMemoryOpOffset(BI->second[i])) { 1403 StopHere = true; 1404 break; 1405 } 1406 } 1407 if (!StopHere) 1408 BI->second.push_back(MI); 1409 } else { 1410 SmallVector<MachineInstr*, 4> MIs; 1411 MIs.push_back(MI); 1412 Base2StsMap[Base] = MIs; 1413 StBases.push_back(Base); 1414 } 1415 } 1416 1417 if (StopHere) { 1418 // Found a duplicate (a base+offset combination that's seen earlier). 1419 // Backtrack. 1420 --Loc; 1421 break; 1422 } 1423 } 1424 1425 // Re-schedule loads. 1426 for (unsigned i = 0, e = LdBases.size(); i != e; ++i) { 1427 unsigned Base = LdBases[i]; 1428 SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base]; 1429 if (Lds.size() > 1) 1430 RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap); 1431 } 1432 1433 // Re-schedule stores. 1434 for (unsigned i = 0, e = StBases.size(); i != e; ++i) { 1435 unsigned Base = StBases[i]; 1436 SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base]; 1437 if (Sts.size() > 1) 1438 RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap); 1439 } 1440 1441 if (MBBI != E) { 1442 Base2LdsMap.clear(); 1443 Base2StsMap.clear(); 1444 LdBases.clear(); 1445 StBases.clear(); 1446 } 1447 } 1448 1449 return RetVal; 1450} 1451 1452 1453/// createARMLoadStoreOptimizationPass - returns an instance of the load / store 1454/// optimization pass. 1455FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { 1456 if (PreAlloc) 1457 return new ARMPreAllocLoadStoreOpt(); 1458 return new ARMLoadStoreOpt(); 1459} 1460