//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which inserts x86 AVX vzeroupper instructions
// before calls to SSE encoded functions. This avoids transition latency
// penalty when transferring control between AVX encoded instructions and old
// SSE encoding mode.
//
//===----------------------------------------------------------------------===//
163bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
173bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes#include "X86.h"
183bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes#include "X86InstrInfo.h"
1936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines#include "X86Subtarget.h"
203bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes#include "llvm/ADT/Statistic.h"
213bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes#include "llvm/CodeGen/MachineFunctionPass.h"
223bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes#include "llvm/CodeGen/MachineInstrBuilder.h"
23bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman#include "llvm/CodeGen/MachineRegisterInfo.h"
243bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes#include "llvm/CodeGen/Passes.h"
25bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman#include "llvm/Support/Debug.h"
26bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman#include "llvm/Support/raw_ostream.h"
273bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes#include "llvm/Target/TargetInstrInfo.h"
283bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopesusing namespace llvm;
293bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
30dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines#define DEBUG_TYPE "x86-vzeroupper"
31dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines
323bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso LopesSTATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
333bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
343bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopesnamespace {
353bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
3636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  class VZeroUpperInserter : public MachineFunctionPass {
3736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  public:
383bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
3936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    VZeroUpperInserter() : MachineFunctionPass(ID) {}
4036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    bool runOnMachineFunction(MachineFunction &MF) override;
4136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    const char *getPassName() const override {return "X86 vzeroupper inserter";}
423bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
433bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes  private:
44bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
4536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    void processBasicBlock(MachineBasicBlock &MBB);
4636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    void insertVZeroUpper(MachineBasicBlock::iterator I,
4736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines                          MachineBasicBlock &MBB);
4836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    void addDirtySuccessor(MachineBasicBlock &MBB);
4936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
5036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
5136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    static const char* getBlockExitStateName(BlockExitState ST);
5236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
5336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // Core algorithm state:
5436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // BlockState - Each block is either:
5536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //   - PASS_THROUGH: There are neither YMM dirtying instructions nor
5636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                   vzeroupper instructions in this block.
5736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //   - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
5836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                  block that will ensure that YMM is clean on exit.
5936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //   - EXITS_DIRTY: An instruction in the block dirties YMM and no
6036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                  subsequent vzeroupper in the block clears it.
61bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    //
6236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // AddedToDirtySuccessors - This flag is raised when a block is added to the
6336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                          DirtySuccessors list to ensure that it's not
6436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                          added multiple times.
65bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    //
6636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // FirstUnguardedCall - Records the location of the first unguarded call in
6736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                      each basic block that may need to be guarded by a
6836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                      vzeroupper. We won't know whether it actually needs
6936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                      to be guarded until we discover a predecessor that
7036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    //                      is DIRTY_OUT.
7136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    struct BlockState {
7236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
7336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      BlockExitState ExitState;
7436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      bool AddedToDirtySuccessors;
7536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      MachineBasicBlock::iterator FirstUnguardedCall;
76bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    };
7736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    typedef SmallVector<BlockState, 8> BlockStateMap;
7836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
79bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
8036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    BlockStateMap BlockStates;
8136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    DirtySuccessorsWorkList DirtySuccessors;
8236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    bool EverMadeChange;
8336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    const TargetInstrInfo *TII;
84bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
8536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    static char ID;
863bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes  };
8736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
883bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes  char VZeroUpperInserter::ID = 0;
893bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes}
903bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
913bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso LopesFunctionPass *llvm::createX86IssueVZeroUpperPass() {
923bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes  return new VZeroUpperInserter();
933bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes}
943bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
9536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesconst char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
9636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  switch (ST) {
9736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    case PASS_THROUGH: return "Pass-through";
9836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    case EXITS_DIRTY: return "Exits-dirty";
9936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    case EXITS_CLEAN: return "Exits-clean";
10036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  }
10136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  llvm_unreachable("Invalid block exit state.");
102e3809eed34f000581a464689596eefde2a6d1f24Elena Demikhovsky}
103bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
10436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic bool isYmmReg(unsigned Reg) {
10536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
106bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman}
107bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
108bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedmanstatic bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
109bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
110bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman       E = MRI.livein_end(); I != E; ++I)
11136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    if (isYmmReg(I->first))
112bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      return true;
113bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
114bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman  return false;
115bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman}
116bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
117d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovskystatic bool clobbersAllYmmRegs(const MachineOperand &MO) {
11836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
119d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovsky    if (!MO.clobbersPhysReg(reg))
120d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovsky      return false;
121d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovsky  }
122d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovsky  return true;
123d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovsky}
124d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovsky
125bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedmanstatic bool hasYmmReg(MachineInstr *MI) {
126df8de92083e9cc97999e9f2f7bc7ef1df9ac6258Craig Topper  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
127bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    const MachineOperand &MO = MI->getOperand(i);
128d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovsky    if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
129d29804f80d1cc26ea552b58693ce883f5b13de7aElena Demikhovsky      return true;
130bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    if (!MO.isReg())
131bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      continue;
132bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    if (MO.isDebug())
133bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      continue;
134bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    if (isYmmReg(MO.getReg()))
135bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      return true;
136bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman  }
137bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman  return false;
138bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman}
139bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
1402990853ea8bf4888b179ac6c493836b83769e87bBill Wendling/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
1412990853ea8bf4888b179ac6c493836b83769e87bBill Wendling/// instruction.
14236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesstatic bool callClobbersAnyYmmReg(MachineInstr *MI) {
14336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  assert(MI->isCall() && "Can only be called on call instructions.");
1442990853ea8bf4888b179ac6c493836b83769e87bBill Wendling  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
1452990853ea8bf4888b179ac6c493836b83769e87bBill Wendling    const MachineOperand &MO = MI->getOperand(i);
1462990853ea8bf4888b179ac6c493836b83769e87bBill Wendling    if (!MO.isRegMask())
1472990853ea8bf4888b179ac6c493836b83769e87bBill Wendling      continue;
14836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
1492990853ea8bf4888b179ac6c493836b83769e87bBill Wendling      if (MO.clobbersPhysReg(reg))
1502990853ea8bf4888b179ac6c493836b83769e87bBill Wendling        return true;
1512990853ea8bf4888b179ac6c493836b83769e87bBill Wendling    }
1522990853ea8bf4888b179ac6c493836b83769e87bBill Wendling  }
1532990853ea8bf4888b179ac6c493836b83769e87bBill Wendling  return false;
1542990853ea8bf4888b179ac6c493836b83769e87bBill Wendling}
1552990853ea8bf4888b179ac6c493836b83769e87bBill Wendling
15636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// Insert a vzeroupper instruction before I.
15736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesvoid VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
15836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines                                              MachineBasicBlock &MBB) {
15936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  DebugLoc dl = I->getDebugLoc();
16036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
16136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  ++NumVZU;
16236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  EverMadeChange = true;
16336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines}
1643bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
16536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines// Add MBB to the DirtySuccessors list if it hasn't already been added.
16636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesvoid VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
16736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
16836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    DirtySuccessors.push_back(&MBB);
16936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
1703bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes  }
1713bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes}
1723bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
1733bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes/// processBasicBlock - Loop over all of the instructions in the basic block,
1743bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes/// inserting vzero upper instructions before function calls.
17536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesvoid VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
176bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
17736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // Start by assuming that the block PASS_THROUGH, which implies no unguarded
17836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // calls.
17936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  BlockExitState CurState = PASS_THROUGH;
18036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
181bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
18236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
183531f025361555e7a695eb559ec02645c054ee146Michael Liao    MachineInstr *MI = I;
1845a96b3dad2f634c9081c8b2b6c2575441dc5a2bdEvan Cheng    bool isControlFlow = MI->isCall() || MI->isReturn();
185bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
186a20e1e7ef596842127794372244fd5c646f71296Chad Rosier    // Shortcut: don't need to check regular instructions in dirty state.
18736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    if (!isControlFlow && CurState == EXITS_DIRTY)
188bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      continue;
189bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
190bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    if (hasYmmReg(MI)) {
191bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      // We found a ymm-using instruction; this could be an AVX instruction,
192bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      // or it could be control flow.
19336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      CurState = EXITS_DIRTY;
194bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      continue;
195bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    }
1963bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
197bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // Check for control-flow out of the current function (which might
198bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // indirectly execute SSE instructions).
199bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    if (!isControlFlow)
200bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      continue;
201bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
2022990853ea8bf4888b179ac6c493836b83769e87bBill Wendling    // If the call won't clobber any YMM register, skip it as well. It usually
2032990853ea8bf4888b179ac6c493836b83769e87bBill Wendling    // happens on helper function calls (such as '_chkstk', '_ftol2') where
2042990853ea8bf4888b179ac6c493836b83769e87bBill Wendling    // standard calling convention is not used (RegMask is not used to mark
2052990853ea8bf4888b179ac6c493836b83769e87bBill Wendling    // register clobbered and register usage (def/imp-def/use) is well-dfined
2062990853ea8bf4888b179ac6c493836b83769e87bBill Wendling    // and explicitly specified.
20736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    if (MI->isCall() && !callClobbersAnyYmmReg(MI))
2082990853ea8bf4888b179ac6c493836b83769e87bBill Wendling      continue;
2092990853ea8bf4888b179ac6c493836b83769e87bBill Wendling
210bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
211bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // registers. This instruction has zero latency. In addition, the processor
212bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // changes back to Clean state, after which execution of Intel SSE
213bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // instructions or Intel AVX instructions has no transition penalty. Add
214bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // the VZEROUPPER instruction before any function call/return that might
215bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // execute SSE code.
216bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // FIXME: In some cases, we may want to move the VZEROUPPER into a
217bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman    // predecessor block.
21836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    if (CurState == EXITS_DIRTY) {
219bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      // After the inserted VZEROUPPER the state becomes clean again, but
220bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      // other YMM may appear before other subsequent calls or even before
221bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman      // the end of the BB.
22236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      insertVZeroUpper(I, MBB);
22336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      CurState = EXITS_CLEAN;
22436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    } else if (CurState == PASS_THROUGH) {
22536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      // If this block is currently in pass-through state and we encounter a
22636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      // call then whether we need a vzeroupper or not depends on whether this
22736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      // block has successors that exit dirty. Record the location of the call,
22836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
22936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      // It will be inserted later if necessary.
23036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      BlockStates[MBB.getNumber()].FirstUnguardedCall = I;
23136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      CurState = EXITS_CLEAN;
2323bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes    }
2333bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes  }
2343bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes
23536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
23636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines               << getBlockExitStateName(CurState) << '\n');
23736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
23836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  if (CurState == EXITS_DIRTY)
23936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
24036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines                                          SE = MBB.succ_end();
24136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines         SI != SE; ++SI)
24236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      addDirtySuccessor(**SI);
24336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
24436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  BlockStates[MBB.getNumber()].ExitState = CurState;
24536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines}
24636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
24736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines/// runOnMachineFunction - Loop over all of the basic blocks, inserting
24836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines/// vzero upper instructions before function calls.
24936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hinesbool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
250dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines  const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
251dce4a407a24b04eebc6a376f8e62b41aaa7b071fStephen Hines  if (!ST.hasAVX() || ST.hasAVX512())
25236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    return false;
25336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  TII = MF.getTarget().getInstrInfo();
25436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  MachineRegisterInfo &MRI = MF.getRegInfo();
25536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  EverMadeChange = false;
256bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
25736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // Fast check: if the function doesn't use any ymm registers, we don't need
25836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // to insert any VZEROUPPER instructions.  This is constant-time, so it is
25936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // cheap in the common case of no ymm use.
26036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  bool YMMUsed = false;
26136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  const TargetRegisterClass *RC = &X86::VR256RegClass;
26236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
26336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines       i != e; i++) {
26436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    if (!MRI.reg_nodbg_empty(*i)) {
26536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      YMMUsed = true;
26636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      break;
26736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    }
26836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  }
26936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  if (!YMMUsed) {
27036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    return false;
27136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  }
272bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
27336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  assert(BlockStates.empty() && DirtySuccessors.empty() &&
27436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines         "X86VZeroUpper state should be clear");
27536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  BlockStates.resize(MF.getNumBlockIDs());
27636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
27736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // Process all blocks. This will compute block exit states, record the first
27836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // unguarded call in each block, and add successors of dirty blocks to the
27936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // DirtySuccessors list.
28036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
28136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    processBasicBlock(*I);
28236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
28336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // If any YMM regs are live in to this function, add the entry block to the
28436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // DirtySuccessors list
28536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  if (checkFnHasLiveInYmm(MRI))
28636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    addDirtySuccessor(MF.front());
28736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
28836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // Re-visit all blocks that are successors of EXITS_DIRTY bsocks. Add
28936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
29036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  // through PASS_THROUGH blocks.
29136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  while (!DirtySuccessors.empty()) {
29236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    MachineBasicBlock &MBB = *DirtySuccessors.back();
29336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    DirtySuccessors.pop_back();
29436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    BlockState &BBState = BlockStates[MBB.getNumber()];
29536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
29636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // MBB is a successor of a dirty block, so its first call needs to be
29736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // guarded.
29836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    if (BBState.FirstUnguardedCall != MBB.end())
29936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
30036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines
30136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // If this successor was a pass-through block then it is now dirty, and its
30236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // successors need to be added to the worklist (if they haven't been
30336b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    // already).
30436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    if (BBState.ExitState == PASS_THROUGH) {
30536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      DEBUG(dbgs() << "MBB #" << MBB.getNumber()
30636b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines                   << " was Pass-through, is now Dirty-out.\n");
30736b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines      for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
30836b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines                                            SE = MBB.succ_end();
30936b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines           SI != SE; ++SI)
31036b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines        addDirtySuccessor(**SI);
31136b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines    }
31236b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  }
313bd00a934c653fb1666fa7d18267644b4e9d14e5eEli Friedman
31436b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  BlockStates.clear();
31536b56886974eae4f9c5ebc96befd3e7bfe5de338Stephen Hines  return EverMadeChange;
3163bde6fe0df05558b89e7edfe48ac05da59beb81aBruno Cardoso Lopes}
317