// X86VZeroUpper.cpp revision 5a96b3dad2f634c9081c8b2b6c2575441dc5a2bd
//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which inserts x86 AVX vzeroupper instructions
// before calls to SSE encoded functions. This avoids transition latency
// penalty when transferring control between AVX encoded instructions and old
// SSE encoding mode.
14dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner// 15dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner//===----------------------------------------------------------------------===// 16dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner 17dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#define DEBUG_TYPE "x86-vzeroupper" 1804087d069a17265b964b30e8210262bbdbc4fbecDaniel Dunbar#include "X86.h" 19dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "X86InstrInfo.h" 20dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "llvm/ADT/Statistic.h" 21dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "llvm/CodeGen/MachineFunctionPass.h" 22dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "llvm/CodeGen/MachineInstrBuilder.h" 23dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "llvm/CodeGen/MachineRegisterInfo.h" 24dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "llvm/CodeGen/Passes.h" 25dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "llvm/Support/Debug.h" 26dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "llvm/Support/raw_ostream.h" 27dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner#include "llvm/Target/TargetInstrInfo.h" 28dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattnerusing namespace llvm; 29dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner 30dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris LattnerSTATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); 31dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner 32dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattnernamespace { 333a54b3dc87a581c203b18050b4f787b4ca28a12cMisha Brukman struct VZeroUpperInserter : public MachineFunctionPass { 34dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner static char ID; 35dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner VZeroUpperInserter() : MachineFunctionPass(ID) {} 36dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner 
373a54b3dc87a581c203b18050b4f787b4ca28a12cMisha Brukman virtual bool runOnMachineFunction(MachineFunction &MF); 38dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner 3904087d069a17265b964b30e8210262bbdbc4fbecDaniel Dunbar bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); 4004087d069a17265b964b30e8210262bbdbc4fbecDaniel Dunbar 41983c7fe847dd3f46945f0117ab19345b9c68e88fDaniel Dunbar virtual const char *getPassName() const { return "X86 vzeroupper inserter";} 42983c7fe847dd3f46945f0117ab19345b9c68e88fDaniel Dunbar 43983c7fe847dd3f46945f0117ab19345b9c68e88fDaniel Dunbar private: 443670a01d0be3bab1aa5856fea02a854aee2b65d6Chris Lattner const TargetInstrInfo *TII; // Machine instruction info. 453670a01d0be3bab1aa5856fea02a854aee2b65d6Chris Lattner MachineBasicBlock *MBB; // Current basic block 463670a01d0be3bab1aa5856fea02a854aee2b65d6Chris Lattner 473670a01d0be3bab1aa5856fea02a854aee2b65d6Chris Lattner // Any YMM register live-in to this function? 483670a01d0be3bab1aa5856fea02a854aee2b65d6Chris Lattner bool FnHasLiveInYmm; 49983c7fe847dd3f46945f0117ab19345b9c68e88fDaniel Dunbar 50dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner // BBState - Contains the state of each MBB: unknown, clean, dirty 5139db3439bfcdca4073dd513879f8ce12ee8c593bDaniel Dunbar SmallVector<uint8_t, 8> BBState; 520fbdfc3664830e8387c13bf817c44e8b71085142Chris Lattner 530fbdfc3664830e8387c13bf817c44e8b71085142Chris Lattner // BBSolved - Keep track of all MBB which had been already analyzed 540fbdfc3664830e8387c13bf817c44e8b71085142Chris Lattner // and there is no further processing required. 
553a54b3dc87a581c203b18050b4f787b4ca28a12cMisha Brukman BitVector BBSolved; 5639db3439bfcdca4073dd513879f8ce12ee8c593bDaniel Dunbar 5739db3439bfcdca4073dd513879f8ce12ee8c593bDaniel Dunbar // Machine Basic Blocks are classified according this pass: 58dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner // 59dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state 60703f5291c4f7199a95274df5e3381b36f8faf38cChris Lattner // until the MBB exit there isn't a instruction using YMM to change 61703f5291c4f7199a95274df5e3381b36f8faf38cChris Lattner // the state to dirty, or one of the incoming predecessors is unknown 62703f5291c4f7199a95274df5e3381b36f8faf38cChris Lattner // and there's not a dirty predecessor between them. 63703f5291c4f7199a95274df5e3381b36f8faf38cChris Lattner // 64dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have 653a54b3dc87a581c203b18050b4f787b4ca28a12cMisha Brukman // instructions using YMM and be marked ST_CLEAN, as long as the state 66dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner // is cleaned by a vzeroupper before any call. 67dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner // 68dd94c8d6b2afb9c33c364ac8f0c8f8ed5d4c04a0Chris Lattner // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a 69 // vzeroupper instruction. 
70 // 71 // ST_INIT - Placeholder for an empty state set 72 // 73 enum { 74 ST_UNKNOWN = 0, 75 ST_CLEAN = 1, 76 ST_DIRTY = 2, 77 ST_INIT = 3 78 }; 79 80 // computeState - Given two states, compute the resulting state, in 81 // the following way 82 // 83 // 1) One dirty state yields another dirty state 84 // 2) All states must be clean for the result to be clean 85 // 3) If none above and one unknown, the result state is also unknown 86 // 87 unsigned computeState(unsigned PrevState, unsigned CurState) { 88 if (PrevState == ST_INIT) 89 return CurState; 90 91 if (PrevState == ST_DIRTY || CurState == ST_DIRTY) 92 return ST_DIRTY; 93 94 if (PrevState == ST_CLEAN && CurState == ST_CLEAN) 95 return ST_CLEAN; 96 97 return ST_UNKNOWN; 98 } 99 100 }; 101 char VZeroUpperInserter::ID = 0; 102} 103 104FunctionPass *llvm::createX86IssueVZeroUpperPass() { 105 return new VZeroUpperInserter(); 106} 107 108static bool isYmmReg(unsigned Reg) { 109 if (Reg >= X86::YMM0 && Reg <= X86::YMM15) 110 return true; 111 112 return false; 113} 114 115static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { 116 for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), 117 E = MRI.livein_end(); I != E; ++I) 118 if (isYmmReg(I->first)) 119 return true; 120 121 return false; 122} 123 124static bool hasYmmReg(MachineInstr *MI) { 125 for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { 126 const MachineOperand &MO = MI->getOperand(i); 127 if (!MO.isReg()) 128 continue; 129 if (MO.isDebug()) 130 continue; 131 if (isYmmReg(MO.getReg())) 132 return true; 133 } 134 return false; 135} 136 137/// runOnMachineFunction - Loop over all of the basic blocks, inserting 138/// vzero upper instructions before function calls. 
139bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { 140 TII = MF.getTarget().getInstrInfo(); 141 MachineRegisterInfo &MRI = MF.getRegInfo(); 142 bool EverMadeChange = false; 143 144 // Fast check: if the function doesn't use any ymm registers, we don't need 145 // to insert any VZEROUPPER instructions. This is constant-time, so it is 146 // cheap in the common case of no ymm use. 147 bool YMMUsed = false; 148 TargetRegisterClass *RC = X86::VR256RegisterClass; 149 for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); 150 i != e; i++) { 151 if (MRI.isPhysRegUsed(*i)) { 152 YMMUsed = true; 153 break; 154 } 155 } 156 if (!YMMUsed) 157 return EverMadeChange; 158 159 // Pre-compute the existence of any live-in YMM registers to this function 160 FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); 161 162 assert(BBState.empty()); 163 BBState.resize(MF.getNumBlockIDs(), 0); 164 BBSolved.resize(MF.getNumBlockIDs(), 0); 165 166 // Each BB state depends on all predecessors, loop over until everything 167 // converges. (Once we converge, we can implicitly mark everything that is 168 // still ST_UNKNOWN as ST_CLEAN.) 169 while (1) { 170 bool MadeChange = false; 171 172 // Process all basic blocks. 173 for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) 174 MadeChange |= processBasicBlock(MF, *I); 175 176 // If this iteration over the code changed anything, keep iterating. 177 if (!MadeChange) break; 178 EverMadeChange = true; 179 } 180 181 BBState.clear(); 182 BBSolved.clear(); 183 return EverMadeChange; 184} 185 186/// processBasicBlock - Loop over all of the instructions in the basic block, 187/// inserting vzero upper instructions before function calls. 
188bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, 189 MachineBasicBlock &BB) { 190 bool Changed = false; 191 unsigned BBNum = BB.getNumber(); 192 MBB = &BB; 193 194 // Don't process already solved BBs 195 if (BBSolved[BBNum]) 196 return false; // No changes 197 198 // Check the state of all predecessors 199 unsigned EntryState = ST_INIT; 200 for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(), 201 PE = BB.pred_end(); PI != PE; ++PI) { 202 EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]); 203 if (EntryState == ST_DIRTY) 204 break; 205 } 206 207 208 // The entry MBB for the function may set the inital state to dirty if 209 // the function receives any YMM incoming arguments 210 if (MBB == MF.begin()) { 211 EntryState = ST_CLEAN; 212 if (FnHasLiveInYmm) 213 EntryState = ST_DIRTY; 214 } 215 216 // The current state is initialized according to the predecessors 217 unsigned CurState = EntryState; 218 bool BBHasCall = false; 219 220 for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { 221 MachineInstr *MI = I; 222 DebugLoc dl = I->getDebugLoc(); 223 bool isControlFlow = MI->isCall() || MI->isReturn(); 224 225 // Shortcut: don't need to check regular instructions in dirty state. 226 if (!isControlFlow && CurState == ST_DIRTY) 227 continue; 228 229 if (hasYmmReg(MI)) { 230 // We found a ymm-using instruction; this could be an AVX instruction, 231 // or it could be control flow. 232 CurState = ST_DIRTY; 233 continue; 234 } 235 236 // Check for control-flow out of the current function (which might 237 // indirectly execute SSE instructions). 238 if (!isControlFlow) 239 continue; 240 241 BBHasCall = true; 242 243 // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX 244 // registers. This instruction has zero latency. In addition, the processor 245 // changes back to Clean state, after which execution of Intel SSE 246 // instructions or Intel AVX instructions has no transition penalty. 
Add 247 // the VZEROUPPER instruction before any function call/return that might 248 // execute SSE code. 249 // FIXME: In some cases, we may want to move the VZEROUPPER into a 250 // predecessor block. 251 if (CurState == ST_DIRTY) { 252 // Only insert the VZEROUPPER in case the entry state isn't unknown. 253 // When unknown, only compute the information within the block to have 254 // it available in the exit if possible, but don't change the block. 255 if (EntryState != ST_UNKNOWN) { 256 BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER)); 257 ++NumVZU; 258 } 259 260 // After the inserted VZEROUPPER the state becomes clean again, but 261 // other YMM may appear before other subsequent calls or even before 262 // the end of the BB. 263 CurState = ST_CLEAN; 264 } 265 } 266 267 DEBUG(dbgs() << "MBB #" << BBNum 268 << ", current state: " << CurState << '\n'); 269 270 // A BB can only be considered solved when we both have done all the 271 // necessary transformations, and have computed the exit state. This happens 272 // in two cases: 273 // 1) We know the entry state: this immediately implies the exit state and 274 // all the necessary transformations. 275 // 2) There are no calls, and and a non-call instruction marks this block: 276 // no transformations are necessary, and we know the exit state. 277 if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN)) 278 BBSolved[BBNum] = true; 279 280 if (CurState != BBState[BBNum]) 281 Changed = true; 282 283 BBState[BBNum] = CurState; 284 return Changed; 285} 286