1//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// When profitable, replace GPR targeting i64 instructions with their 10// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined 11// as minimizing the number of cross-class register copies. 12//===----------------------------------------------------------------------===// 13 14//===----------------------------------------------------------------------===// 15// TODO: Graph based predicate heuristics. 16// Walking the instruction list linearly will get many, perhaps most, of 17// the cases, but to do a truly thorough job of this, we need a more 18// wholistic approach. 19// 20// This optimization is very similar in spirit to the register allocator's 21// spill placement, only here we're determining where to place cross-class 22// register copies rather than spills. As such, a similar approach is 23// called for. 24// 25// We want to build up a set of graphs of all instructions which are candidates 26// for transformation along with instructions which generate their inputs and 27// consume their outputs. For each edge in the graph, we assign a weight 28// based on whether there is a copy required there (weight zero if not) and 29// the block frequency of the block containing the defining or using 30// instruction, whichever is less. Our optimization is then a graph problem 31// to minimize the total weight of all the graphs, then transform instructions 32// and add or remove copy instructions as called for to implement the 33// solution. 34//===----------------------------------------------------------------------===// 35 36#include "AArch64.h" 37#include "AArch64InstrInfo.h" 38#include "AArch64RegisterInfo.h" 39#include "AArch64Subtarget.h" 40#include "llvm/ADT/Statistic.h" 41#include "llvm/CodeGen/MachineFunction.h" 42#include "llvm/CodeGen/MachineFunctionPass.h" 43#include "llvm/CodeGen/MachineInstr.h" 44#include "llvm/CodeGen/MachineInstrBuilder.h" 45#include "llvm/CodeGen/MachineRegisterInfo.h" 46#include "llvm/Support/CommandLine.h" 47#include "llvm/Support/Debug.h" 48#include "llvm/Support/raw_ostream.h" 49using namespace llvm; 50 51#define DEBUG_TYPE "aarch64-simd-scalar" 52 53// Allow forcing all i64 operations with equivalent SIMD instructions to use 54// them. For stress-testing the transformation function. 55static cl::opt<bool> 56TransformAll("aarch64-simd-scalar-force-all", 57 cl::desc("Force use of AdvSIMD scalar instructions everywhere"), 58 cl::init(false), cl::Hidden); 59 60STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); 61STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); 62STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); 63 64namespace llvm { 65void initializeAArch64AdvSIMDScalarPass(PassRegistry &); 66} 67 68#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization" 69 70namespace { 71class AArch64AdvSIMDScalar : public MachineFunctionPass { 72 MachineRegisterInfo *MRI; 73 const TargetInstrInfo *TII; 74 75private: 76 // isProfitableToTransform - Predicate function to determine whether an 77 // instruction should be transformed to its equivalent AdvSIMD scalar 78 // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. 79 bool isProfitableToTransform(const MachineInstr &MI) const; 80 81 // transformInstruction - Perform the transformation of an instruction 82 // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs 83 // to be the correct register class, minimizing cross-class copies. 84 void transformInstruction(MachineInstr &MI); 85 86 // processMachineBasicBlock - Main optimzation loop. 87 bool processMachineBasicBlock(MachineBasicBlock *MBB); 88 89public: 90 static char ID; // Pass identification, replacement for typeid. 91 explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) { 92 initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry()); 93 } 94 95 bool runOnMachineFunction(MachineFunction &F) override; 96 97 const char *getPassName() const override { 98 return AARCH64_ADVSIMD_NAME; 99 } 100 101 void getAnalysisUsage(AnalysisUsage &AU) const override { 102 AU.setPreservesCFG(); 103 MachineFunctionPass::getAnalysisUsage(AU); 104 } 105}; 106char AArch64AdvSIMDScalar::ID = 0; 107} // end anonymous namespace 108 109INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar", 110 AARCH64_ADVSIMD_NAME, false, false) 111 112static bool isGPR64(unsigned Reg, unsigned SubReg, 113 const MachineRegisterInfo *MRI) { 114 if (SubReg) 115 return false; 116 if (TargetRegisterInfo::isVirtualRegister(Reg)) 117 return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass); 118 return AArch64::GPR64RegClass.contains(Reg); 119} 120 121static bool isFPR64(unsigned Reg, unsigned SubReg, 122 const MachineRegisterInfo *MRI) { 123 if (TargetRegisterInfo::isVirtualRegister(Reg)) 124 return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) && 125 SubReg == 0) || 126 (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) && 127 SubReg == AArch64::dsub); 128 // Physical register references just check the register class directly. 129 return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) || 130 (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub); 131} 132 133// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 134// copy instruction. Return zero_reg if the instruction is not a copy. 135static MachineOperand *getSrcFromCopy(MachineInstr *MI, 136 const MachineRegisterInfo *MRI, 137 unsigned &SubReg) { 138 SubReg = 0; 139 // The "FMOV Xd, Dn" instruction is the typical form. 140 if (MI->getOpcode() == AArch64::FMOVDXr || 141 MI->getOpcode() == AArch64::FMOVXDr) 142 return &MI->getOperand(1); 143 // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see 144 // these at this stage, but it's easy to check for. 145 if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { 146 SubReg = AArch64::dsub; 147 return &MI->getOperand(1); 148 } 149 // Or just a plain COPY instruction. This can be directly to/from FPR64, 150 // or it can be a dsub subreg reference to an FPR128. 151 if (MI->getOpcode() == AArch64::COPY) { 152 if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), 153 MRI) && 154 isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) 155 return &MI->getOperand(1); 156 if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), 157 MRI) && 158 isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), 159 MRI)) { 160 SubReg = MI->getOperand(1).getSubReg(); 161 return &MI->getOperand(1); 162 } 163 } 164 165 // Otherwise, this is some other kind of instruction. 166 return nullptr; 167} 168 169// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent 170// that we're considering transforming to, return that AdvSIMD opcode. For all 171// others, return the original opcode. 172static unsigned getTransformOpcode(unsigned Opc) { 173 switch (Opc) { 174 default: 175 break; 176 // FIXME: Lots more possibilities. 177 case AArch64::ADDXrr: 178 return AArch64::ADDv1i64; 179 case AArch64::SUBXrr: 180 return AArch64::SUBv1i64; 181 case AArch64::ANDXrr: 182 return AArch64::ANDv8i8; 183 case AArch64::EORXrr: 184 return AArch64::EORv8i8; 185 case AArch64::ORRXrr: 186 return AArch64::ORRv8i8; 187 } 188 // No AdvSIMD equivalent, so just return the original opcode. 189 return Opc; 190} 191 192static bool isTransformable(const MachineInstr &MI) { 193 unsigned Opc = MI.getOpcode(); 194 return Opc != getTransformOpcode(Opc); 195} 196 197// isProfitableToTransform - Predicate function to determine whether an 198// instruction should be transformed to its equivalent AdvSIMD scalar 199// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. 200bool AArch64AdvSIMDScalar::isProfitableToTransform( 201 const MachineInstr &MI) const { 202 // If this instruction isn't eligible to be transformed (no SIMD equivalent), 203 // early exit since that's the common case. 204 if (!isTransformable(MI)) 205 return false; 206 207 // Count the number of copies we'll need to add and approximate the number 208 // of copies that a transform will enable us to remove. 209 unsigned NumNewCopies = 3; 210 unsigned NumRemovableCopies = 0; 211 212 unsigned OrigSrc0 = MI.getOperand(1).getReg(); 213 unsigned OrigSrc1 = MI.getOperand(2).getReg(); 214 unsigned SubReg0; 215 unsigned SubReg1; 216 if (!MRI->def_empty(OrigSrc0)) { 217 MachineRegisterInfo::def_instr_iterator Def = 218 MRI->def_instr_begin(OrigSrc0); 219 assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); 220 MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0); 221 // If the source was from a copy, we don't need to insert a new copy. 222 if (MOSrc0) 223 --NumNewCopies; 224 // If there are no other users of the original source, we can delete 225 // that instruction. 226 if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0)) 227 ++NumRemovableCopies; 228 } 229 if (!MRI->def_empty(OrigSrc1)) { 230 MachineRegisterInfo::def_instr_iterator Def = 231 MRI->def_instr_begin(OrigSrc1); 232 assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); 233 MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1); 234 if (MOSrc1) 235 --NumNewCopies; 236 // If there are no other users of the original source, we can delete 237 // that instruction. 238 if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1)) 239 ++NumRemovableCopies; 240 } 241 242 // If any of the uses of the original instructions is a cross class copy, 243 // that's a copy that will be removable if we transform. Likewise, if 244 // any of the uses is a transformable instruction, it's likely the tranforms 245 // will chain, enabling us to save a copy there, too. This is an aggressive 246 // heuristic that approximates the graph based cost analysis described above. 247 unsigned Dst = MI.getOperand(0).getReg(); 248 bool AllUsesAreCopies = true; 249 for (MachineRegisterInfo::use_instr_nodbg_iterator 250 Use = MRI->use_instr_nodbg_begin(Dst), 251 E = MRI->use_instr_nodbg_end(); 252 Use != E; ++Use) { 253 unsigned SubReg; 254 if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use)) 255 ++NumRemovableCopies; 256 // If the use is an INSERT_SUBREG, that's still something that can 257 // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's 258 // preferable to have it use the FPR64 in most cases, as if the source 259 // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely. 260 // Ditto for a lane insert. 261 else if (Use->getOpcode() == AArch64::INSERT_SUBREG || 262 Use->getOpcode() == AArch64::INSvi64gpr) 263 ; 264 else 265 AllUsesAreCopies = false; 266 } 267 // If all of the uses of the original destination register are copies to 268 // FPR64, then we won't end up having a new copy back to GPR64 either. 269 if (AllUsesAreCopies) 270 --NumNewCopies; 271 272 // If a transform will not increase the number of cross-class copies required, 273 // return true. 274 if (NumNewCopies <= NumRemovableCopies) 275 return true; 276 277 // Finally, even if we otherwise wouldn't transform, check if we're forcing 278 // transformation of everything. 279 return TransformAll; 280} 281 282static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI, 283 unsigned Dst, unsigned Src, bool IsKill) { 284 MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), 285 TII->get(AArch64::COPY), Dst) 286 .addReg(Src, getKillRegState(IsKill)); 287 DEBUG(dbgs() << " adding copy: " << *MIB); 288 ++NumCopiesInserted; 289 return MIB; 290} 291 292// transformInstruction - Perform the transformation of an instruction 293// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs 294// to be the correct register class, minimizing cross-class copies. 295void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) { 296 DEBUG(dbgs() << "Scalar transform: " << MI); 297 298 MachineBasicBlock *MBB = MI.getParent(); 299 unsigned OldOpc = MI.getOpcode(); 300 unsigned NewOpc = getTransformOpcode(OldOpc); 301 assert(OldOpc != NewOpc && "transform an instruction to itself?!"); 302 303 // Check if we need a copy for the source registers. 304 unsigned OrigSrc0 = MI.getOperand(1).getReg(); 305 unsigned OrigSrc1 = MI.getOperand(2).getReg(); 306 unsigned Src0 = 0, SubReg0; 307 unsigned Src1 = 0, SubReg1; 308 bool KillSrc0 = false, KillSrc1 = false; 309 if (!MRI->def_empty(OrigSrc0)) { 310 MachineRegisterInfo::def_instr_iterator Def = 311 MRI->def_instr_begin(OrigSrc0); 312 assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); 313 MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0); 314 // If there are no other users of the original source, we can delete 315 // that instruction. 316 if (MOSrc0) { 317 Src0 = MOSrc0->getReg(); 318 KillSrc0 = MOSrc0->isKill(); 319 // Src0 is going to be reused, thus, it cannot be killed anymore. 320 MOSrc0->setIsKill(false); 321 if (MRI->hasOneNonDBGUse(OrigSrc0)) { 322 assert(MOSrc0 && "Can't delete copy w/o a valid original source!"); 323 Def->eraseFromParent(); 324 ++NumCopiesDeleted; 325 } 326 } 327 } 328 if (!MRI->def_empty(OrigSrc1)) { 329 MachineRegisterInfo::def_instr_iterator Def = 330 MRI->def_instr_begin(OrigSrc1); 331 assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); 332 MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1); 333 // If there are no other users of the original source, we can delete 334 // that instruction. 335 if (MOSrc1) { 336 Src1 = MOSrc1->getReg(); 337 KillSrc1 = MOSrc1->isKill(); 338 // Src0 is going to be reused, thus, it cannot be killed anymore. 339 MOSrc1->setIsKill(false); 340 if (MRI->hasOneNonDBGUse(OrigSrc1)) { 341 assert(MOSrc1 && "Can't delete copy w/o a valid original source!"); 342 Def->eraseFromParent(); 343 ++NumCopiesDeleted; 344 } 345 } 346 } 347 // If we weren't able to reference the original source directly, create a 348 // copy. 349 if (!Src0) { 350 SubReg0 = 0; 351 Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); 352 insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0); 353 KillSrc0 = true; 354 } 355 if (!Src1) { 356 SubReg1 = 0; 357 Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); 358 insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1); 359 KillSrc1 = true; 360 } 361 362 // Create a vreg for the destination. 363 // FIXME: No need to do this if the ultimate user expects an FPR64. 364 // Check for that and avoid the copy if possible. 365 unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass); 366 367 // For now, all of the new instructions have the same simple three-register 368 // form, so no need to special case based on what instruction we're 369 // building. 370 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst) 371 .addReg(Src0, getKillRegState(KillSrc0), SubReg0) 372 .addReg(Src1, getKillRegState(KillSrc1), SubReg1); 373 374 // Now copy the result back out to a GPR. 375 // FIXME: Try to avoid this if all uses could actually just use the FPR64 376 // directly. 377 insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true); 378 379 // Erase the old instruction. 380 MI.eraseFromParent(); 381 382 ++NumScalarInsnsUsed; 383} 384 385// processMachineBasicBlock - Main optimzation loop. 386bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { 387 bool Changed = false; 388 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { 389 MachineInstr &MI = *I++; 390 if (isProfitableToTransform(MI)) { 391 transformInstruction(MI); 392 Changed = true; 393 } 394 } 395 return Changed; 396} 397 398// runOnMachineFunction - Pass entry point from PassManager. 399bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { 400 bool Changed = false; 401 DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); 402 403 if (skipFunction(*mf.getFunction())) 404 return false; 405 406 MRI = &mf.getRegInfo(); 407 TII = mf.getSubtarget().getInstrInfo(); 408 409 // Just check things on a one-block-at-a-time basis. 410 for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) 411 if (processMachineBasicBlock(&*I)) 412 Changed = true; 413 return Changed; 414} 415 416// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine 417// to add the pass to the PassManager. 418FunctionPass *llvm::createAArch64AdvSIMDScalar() { 419 return new AArch64AdvSIMDScalar(); 420} 421