//===-- AArch64AdvSIMDScalar.cpp - Use AdvSIMD scalar i64 instructions ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// When profitable, replace GPR targeting i64 instructions with their
// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
// as minimizing the number of cross-class register copies.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// TODO: Graph based predicate heuristics.
// Walking the instruction list linearly will get many, perhaps most, of
// the cases, but to do a truly thorough job of this, we need a more
// holistic approach.
1936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// 2036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// This optimization is very similar in spirit to the register allocator's 2136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// spill placement, only here we're determining where to place cross-class 2236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// register copies rather than spills. As such, a similar approach is 2336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// called for. 2436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// 2536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// We want to build up a set of graphs of all instructions which are candidates 2636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// for transformation along with instructions which generate their inputs and 2736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// consume their outputs. For each edge in the graph, we assign a weight 2836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// based on whether there is a copy required there (weight zero if not) and 2936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// the block frequency of the block containing the defining or using 3036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// instruction, whichever is less. Our optimization is then a graph problem 3136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// to minimize the total weight of all the graphs, then transform instructions 3236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// and add or remove copy instructions as called for to implement the 3336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// solution. 
3436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines//===----------------------------------------------------------------------===// 3536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 36dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines#include "AArch64.h" 37dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines#include "AArch64InstrInfo.h" 38dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines#include "AArch64RegisterInfo.h" 3936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/ADT/Statistic.h" 4036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/CodeGen/MachineFunctionPass.h" 4136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/CodeGen/MachineFunction.h" 4236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/CodeGen/MachineInstr.h" 4336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/CodeGen/MachineInstrBuilder.h" 4436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/CodeGen/MachineRegisterInfo.h" 4536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/Support/CommandLine.h" 4636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/Support/Debug.h" 4736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "llvm/Support/raw_ostream.h" 4836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesusing namespace llvm; 4936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 50dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines#define DEBUG_TYPE "aarch64-simd-scalar" 51dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines 5236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// Allow forcing all i64 operations with equivalent SIMD instructions to use 5336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// them. For stress-testing the transformation function. 
5436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic cl::opt<bool> 55dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen HinesTransformAll("aarch64-simd-scalar-force-all", 5636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines cl::desc("Force use of AdvSIMD scalar instructions everywhere"), 5736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines cl::init(false), cl::Hidden); 5836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 5936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen HinesSTATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); 6036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen HinesSTATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); 6136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen HinesSTATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); 6236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 6336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesnamespace { 64dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hinesclass AArch64AdvSIMDScalar : public MachineFunctionPass { 6536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineRegisterInfo *MRI; 66dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines const AArch64InstrInfo *TII; 6736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 6836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesprivate: 6936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // isProfitableToTransform - Predicate function to determine whether an 7036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // instruction should be transformed to its equivalent AdvSIMD scalar 7136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. 
7236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines bool isProfitableToTransform(const MachineInstr *MI) const; 7336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 74dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines // transformInstruction - Perform the transformation of an instruction 7536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs 7636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // to be the correct register class, minimizing cross-class copies. 7736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines void transformInstruction(MachineInstr *MI); 7836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 7936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // processMachineBasicBlock - Main optimzation loop. 8036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines bool processMachineBasicBlock(MachineBasicBlock *MBB); 8136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 8236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinespublic: 8336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines static char ID; // Pass identification, replacement for typeid. 
84dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} 8536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 86dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines bool runOnMachineFunction(MachineFunction &F) override; 8736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 88dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines const char *getPassName() const override { 89dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines return "AdvSIMD Scalar Operation Optimization"; 9036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 9136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 92dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines void getAnalysisUsage(AnalysisUsage &AU) const override { 9336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines AU.setPreservesCFG(); 9436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineFunctionPass::getAnalysisUsage(AU); 9536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 9636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines}; 97dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hineschar AArch64AdvSIMDScalar::ID = 0; 9836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} // end anonymous namespace 9936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 10036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic bool isGPR64(unsigned Reg, unsigned SubReg, 10136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines const MachineRegisterInfo *MRI) { 10236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (SubReg) 10336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return false; 10436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (TargetRegisterInfo::isVirtualRegister(Reg)) 105dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass); 106dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines return AArch64::GPR64RegClass.contains(Reg); 
10736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 10836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 10936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic bool isFPR64(unsigned Reg, unsigned SubReg, 11036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines const MachineRegisterInfo *MRI) { 11136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (TargetRegisterInfo::isVirtualRegister(Reg)) 112dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) && 11336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines SubReg == 0) || 114dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) && 115dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines SubReg == AArch64::dsub); 116dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines // Physical register references just check the register class directly. 117dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) || 118dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub); 11936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 12036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 12136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 12236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// copy instruction. Return zero_reg if the instruction is not a copy. 
12336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic unsigned getSrcFromCopy(const MachineInstr *MI, 12436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines const MachineRegisterInfo *MRI, 12536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned &SubReg) { 12636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines SubReg = 0; 12736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // The "FMOV Xd, Dn" instruction is the typical form. 128dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines if (MI->getOpcode() == AArch64::FMOVDXr || 129dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines MI->getOpcode() == AArch64::FMOVXDr) 13036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return MI->getOperand(1).getReg(); 13136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see 13236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // these at this stage, but it's easy to check for. 133dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { 134dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines SubReg = AArch64::dsub; 13536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return MI->getOperand(1).getReg(); 13636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 13736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Or just a plain COPY instruction. This can be directly to/from FPR64, 13836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // or it can be a dsub subreg reference to an FPR128. 
139dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines if (MI->getOpcode() == AArch64::COPY) { 14036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), 14136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MRI) && 14236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) 14336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return MI->getOperand(1).getReg(); 14436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), 14536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MRI) && 14636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), 14736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MRI)) { 148dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines SubReg = MI->getOperand(1).getSubReg(); 14936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return MI->getOperand(1).getReg(); 15036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 15136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 15236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 15336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Otherwise, this is some other kind of instruction. 15436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return 0; 15536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 15636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 15736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent 15836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// that we're considering transforming to, return that AdvSIMD opcode. For all 15936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// others, return the original opcode. 
16036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic int getTransformOpcode(unsigned Opc) { 16136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines switch (Opc) { 16236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines default: 16336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines break; 16436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // FIXME: Lots more possibilities. 165dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines case AArch64::ADDXrr: 166dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines return AArch64::ADDv1i64; 167dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines case AArch64::SUBXrr: 168dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines return AArch64::SUBv1i64; 16936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 17036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // No AdvSIMD equivalent, so just return the original opcode. 17136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return Opc; 17236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 17336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 17436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic bool isTransformable(const MachineInstr *MI) { 17536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines int Opc = MI->getOpcode(); 17636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return Opc != getTransformOpcode(Opc); 17736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 17836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 17936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// isProfitableToTransform - Predicate function to determine whether an 18036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// instruction should be transformed to its equivalent AdvSIMD scalar 18136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. 
182dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hinesbool 183dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen HinesAArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { 18436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If this instruction isn't eligible to be transformed (no SIMD equivalent), 18536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // early exit since that's the common case. 18636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (!isTransformable(MI)) 18736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return false; 18836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 18936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Count the number of copies we'll need to add and approximate the number 19036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // of copies that a transform will enable us to remove. 19136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned NumNewCopies = 3; 19236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned NumRemovableCopies = 0; 19336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 19436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned OrigSrc0 = MI->getOperand(1).getReg(); 19536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned OrigSrc1 = MI->getOperand(2).getReg(); 19636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned Src0 = 0, SubReg0; 19736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned Src1 = 0, SubReg1; 19836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (!MRI->def_empty(OrigSrc0)) { 19936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineRegisterInfo::def_instr_iterator Def = 20036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MRI->def_instr_begin(OrigSrc0); 20136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); 20236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Src0 = getSrcFromCopy(&*Def, 
MRI, SubReg0); 20336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If the source was from a copy, we don't need to insert a new copy. 20436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (Src0) 20536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines --NumNewCopies; 20636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If there are no other users of the original source, we can delete 20736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // that instruction. 20836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) 20936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ++NumRemovableCopies; 21036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 21136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (!MRI->def_empty(OrigSrc1)) { 21236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineRegisterInfo::def_instr_iterator Def = 21336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MRI->def_instr_begin(OrigSrc1); 21436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); 21536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); 21636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (Src1) 21736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines --NumNewCopies; 21836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If there are no other users of the original source, we can delete 21936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // that instruction. 
22036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) 22136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ++NumRemovableCopies; 22236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 22336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 22436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If any of the uses of the original instructions is a cross class copy, 22536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // that's a copy that will be removable if we transform. Likewise, if 22636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // any of the uses is a transformable instruction, it's likely the tranforms 22736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // will chain, enabling us to save a copy there, too. This is an aggressive 22836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // heuristic that approximates the graph based cost analysis described above. 22936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned Dst = MI->getOperand(0).getReg(); 23036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines bool AllUsesAreCopies = true; 23136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines for (MachineRegisterInfo::use_instr_nodbg_iterator 23236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Use = MRI->use_instr_nodbg_begin(Dst), 23336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines E = MRI->use_instr_nodbg_end(); 23436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Use != E; ++Use) { 23536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned SubReg; 23636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) 23736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ++NumRemovableCopies; 23836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If the use is an INSERT_SUBREG, that's still something that can 23936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // directly use the FPR64, so 
we don't invalidate AllUsesAreCopies. It's 24036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // preferable to have it use the FPR64 in most cases, as if the source 24136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely. 24236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Ditto for a lane insert. 243dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines else if (Use->getOpcode() == AArch64::INSERT_SUBREG || 244dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines Use->getOpcode() == AArch64::INSvi64gpr) 24536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ; 24636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines else 24736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines AllUsesAreCopies = false; 24836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 24936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If all of the uses of the original destination register are copies to 25036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // FPR64, then we won't end up having a new copy back to GPR64 either. 25136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (AllUsesAreCopies) 25236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines --NumNewCopies; 25336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 254dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines // If a transform will not increase the number of cross-class copies required, 25536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // return true. 25636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (NumNewCopies <= NumRemovableCopies) 25736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return true; 25836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 25936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Finally, even if we otherwise wouldn't transform, check if we're forcing 26036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // transformation of everything. 
26136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return TransformAll; 26236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 26336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 264dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hinesstatic MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI, 26536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned Dst, unsigned Src, bool IsKill) { 26636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineInstrBuilder MIB = 267dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), 26836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Dst) 26936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines .addReg(Src, getKillRegState(IsKill)); 27036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines DEBUG(dbgs() << " adding copy: " << *MIB); 27136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ++NumCopiesInserted; 27236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return MIB; 27336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 27436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 275dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines// transformInstruction - Perform the transformation of an instruction 27636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs 27736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// to be the correct register class, minimizing cross-class copies. 
278dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hinesvoid AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { 27936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines DEBUG(dbgs() << "Scalar transform: " << *MI); 28036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 28136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineBasicBlock *MBB = MI->getParent(); 28236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines int OldOpc = MI->getOpcode(); 28336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines int NewOpc = getTransformOpcode(OldOpc); 28436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines assert(OldOpc != NewOpc && "transform an instruction to itself?!"); 28536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 28636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Check if we need a copy for the source registers. 28736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned OrigSrc0 = MI->getOperand(1).getReg(); 28836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned OrigSrc1 = MI->getOperand(2).getReg(); 28936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned Src0 = 0, SubReg0; 29036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines unsigned Src1 = 0, SubReg1; 29136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (!MRI->def_empty(OrigSrc0)) { 29236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineRegisterInfo::def_instr_iterator Def = 29336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MRI->def_instr_begin(OrigSrc0); 29436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); 29536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); 29636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If there are no other users of the original source, we can delete 29736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // that instruction. 
29836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { 29936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines assert(Src0 && "Can't delete copy w/o a valid original source!"); 30036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Def->eraseFromParent(); 30136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ++NumCopiesDeleted; 30236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 30336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 30436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (!MRI->def_empty(OrigSrc1)) { 30536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineRegisterInfo::def_instr_iterator Def = 30636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MRI->def_instr_begin(OrigSrc1); 30736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); 30836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); 30936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If there are no other users of the original source, we can delete 31036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // that instruction. 31136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { 31236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines assert(Src1 && "Can't delete copy w/o a valid original source!"); 31336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Def->eraseFromParent(); 31436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ++NumCopiesDeleted; 31536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 31636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 31736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // If we weren't able to reference the original source directly, create a 31836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // copy. 
31936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (!Src0) { 32036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines SubReg0 = 0; 321dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); 32236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines insertCopy(TII, MI, Src0, OrigSrc0, true); 32336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 32436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (!Src1) { 32536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines SubReg1 = 0; 326dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); 32736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines insertCopy(TII, MI, Src1, OrigSrc1, true); 32836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 32936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 33036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Create a vreg for the destination. 33136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // FIXME: No need to do this if the ultimate user expects an FPR64. 33236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Check for that and avoid the copy if possible. 333dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass); 33436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 33536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // For now, all of the new instructions have the same simple three-register 33636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // form, so no need to special case based on what instruction we're 33736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // building. 
33836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) 33936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines .addReg(Src0, getKillRegState(true), SubReg0) 34036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines .addReg(Src1, getKillRegState(true), SubReg1); 34136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 34236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Now copy the result back out to a GPR. 34336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // FIXME: Try to avoid this if all uses could actually just use the FPR64 34436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // directly. 34536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); 34636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 34736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Erase the old instruction. 34836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MI->eraseFromParent(); 34936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 35036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ++NumScalarInsnsUsed; 35136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 35236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 35336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// processMachineBasicBlock - Main optimzation loop. 
354dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hinesbool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { 35536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines bool Changed = false; 35636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { 35736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MachineInstr *MI = I; 35836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines ++I; 35936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (isProfitableToTransform(MI)) { 36036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines transformInstruction(MI); 36136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Changed = true; 36236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 36336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines } 36436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return Changed; 36536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 36636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 36736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// runOnMachineFunction - Pass entry point from PassManager. 
368dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hinesbool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { 36936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines bool Changed = false; 370dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); 37136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 37236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines const TargetMachine &TM = mf.getTarget(); 37336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines MRI = &mf.getRegInfo(); 374dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo()); 37536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 37636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines // Just check things on a one-block-at-a-time basis. 37736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) 37836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines if (processMachineBasicBlock(I)) 37936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines Changed = true; 38036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines return Changed; 38136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 38236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines 383dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine 38436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// to add the pass to the PassManager. 385dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen HinesFunctionPass *llvm::createAArch64AdvSIMDScalar() { 386dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines return new AArch64AdvSIMDScalar(); 38736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines} 388