From 3dd951e842c90476d22f071cae2a6848790ad4cc Mon Sep 17 00:00:00 2001
From: Lang Hames
Date: Mon, 17 Mar 2014 01:22:54 +0000
Subject: [PATCH] [X86] New and improved VZeroUpperInserter optimization.

- Adds support for inserting vzerouppers before tail-calls.
  This is enabled implicitly by having MachineInstr::copyImplicitOps preserve
  regmask operands, which allows VZeroUpperInserter to see where tail-calls
  use vector registers.

- Fixes a bug that caused the previous version of this optimization to miss
  some vzeroupper insertion points in loops. (Loops-with-vector-code that
  followed loops-without-vector-code were mistakenly overlooked by the
  previous version).

- New algorithm never revisits instructions.

Fixes

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@204021 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineInstr.cpp       |   2 +-
 lib/Target/X86/X86VZeroUpper.cpp   | 325 ++++++++++++++---------------
 test/CodeGen/X86/avx-vzeroupper.ll |  37 +++-
 3 files changed, 190 insertions(+), 174 deletions(-)

diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index bbb5cd203b1..9b9b7f089c0 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -1434,7 +1434,7 @@ void MachineInstr::copyImplicitOps(MachineFunction &MF,
   for (unsigned i = MI->getDesc().getNumOperands(), e = MI->getNumOperands();
        i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (MO.isReg() && MO.isImplicit())
+    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
       addOperand(MF, MO);
   }
 }
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 9600138dc04..d4341b9fc35 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -31,73 +31,59 @@ using namespace llvm;
 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
 
 namespace {
-  struct VZeroUpperInserter : public MachineFunctionPass {
-    static char ID;
+
+  class VZeroUpperInserter : public MachineFunctionPass {
+  public:
+
     VZeroUpperInserter() : MachineFunctionPass(ID) {}
-
     bool runOnMachineFunction(MachineFunction &MF) override;
-
-    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
-
     const char *getPassName() const override {return "X86 vzeroupper inserter";}
 
   private:
-    const TargetInstrInfo *TII; // Machine instruction info.
-
-    // Any YMM register live-in to this function?
-    bool FnHasLiveInYmm;
+    void processBasicBlock(MachineBasicBlock &MBB);
+    void insertVZeroUpper(MachineBasicBlock::iterator I,
+                          MachineBasicBlock &MBB);
+    void addDirtySuccessor(MachineBasicBlock &MBB);
 
-    // BBState - Contains the state of each MBB: unknown, clean, dirty
-    SmallVector BBState;
+    typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
+    static const char* getBlockExitStateName(BlockExitState ST);
 
-    // BBSolved - Keep track of all MBB which had been already analyzed
-    // and there is no further processing required.
-    BitVector BBSolved;
-
-    // Machine Basic Blocks are classified according this pass:
+    // Core algorithm state:
+    // BlockState - Each block is either:
+    //   - PASS_THROUGH: There are neither YMM dirtying instructions nor
+    //                   vzeroupper instructions in this block.
+    //   - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
+    //                  block that will ensure that YMM is clean on exit.
+    //   - EXITS_DIRTY: An instruction in the block dirties YMM and no
+    //                  subsequent vzeroupper in the block clears it.
     //
-    // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state
-    //   until the MBB exit there isn't a instruction using YMM to change
-    //   the state to dirty, or one of the incoming predecessors is unknown
-    //   and there's not a dirty predecessor between them.
+    // AddedToDirtySuccessors - This flag is raised when a block is added to the
+    //                          DirtySuccessors list to ensure that it's not
+    //                          added multiple times.
     //
-    // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have
-    //   instructions using YMM and be marked ST_CLEAN, as long as the state
-    //   is cleaned by a vzeroupper before any call.
-    //
-    // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
-    //   vzeroupper instruction.
-    //
-    // ST_INIT - Placeholder for an empty state set
-    //
-    enum {
-      ST_UNKNOWN = 0,
-      ST_CLEAN = 1,
-      ST_DIRTY = 2,
-      ST_INIT = 3
+    // FirstUnguardedCall - Records the location of the first unguarded call in
+    //                      each basic block that may need to be guarded by a
+    //                      vzeroupper. We won't know whether it actually needs
+    //                      to be guarded until we discover a predecessor that
+    //                      is DIRTY_OUT.
+    struct BlockState {
+      BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
+      BlockExitState ExitState;
+      bool AddedToDirtySuccessors;
+      MachineBasicBlock::iterator FirstUnguardedCall;
     };
+    typedef SmallVector BlockStateMap;
+    typedef SmallVector DirtySuccessorsWorkList;
 
-    // computeState - Given two states, compute the resulting state, in
-    // the following way
-    //
-    //  1) One dirty state yields another dirty state
-    //  2) All states must be clean for the result to be clean
-    //  3) If none above and one unknown, the result state is also unknown
-    //
-    static unsigned computeState(unsigned PrevState, unsigned CurState) {
-      if (PrevState == ST_INIT)
-        return CurState;
-
-      if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
-        return ST_DIRTY;
-
-      if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
-        return ST_CLEAN;
-
-      return ST_UNKNOWN;
-    }
+    BlockStateMap BlockStates;
+    DirtySuccessorsWorkList DirtySuccessors;
+    bool EverMadeChange;
+    const TargetInstrInfo *TII;
+    static char ID;
   };
+
   char VZeroUpperInserter::ID = 0;
 }
 
@@ -105,6 +91,15 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
   return new VZeroUpperInserter();
 }
 
+const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
+  switch (ST) {
+    case PASS_THROUGH: return "Pass-through";
+    case EXITS_DIRTY: return "Exits-dirty";
+    case EXITS_CLEAN: return "Exits-clean";
+  }
+  llvm_unreachable("Invalid block exit state.");
+}
+
 static bool isYmmReg(unsigned Reg) {
   return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
 }
@@ -143,7 +138,8 @@ static bool hasYmmReg(MachineInstr *MI) {
 /// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
 /// instruction.
-static bool clobbersAnyYmmReg(MachineInstr *MI) {
+static bool callClobbersAnyYmmReg(MachineInstr *MI) {
+  assert(MI->isCall() && "Can only be called on call instructions.");
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (!MO.isRegMask())
       continue;
@@ -156,104 +152,44 @@ static bool clobbersAnyYmmReg(MachineInstr *MI) {
   return false;
 }
 
-/// runOnMachineFunction - Loop over all of the basic blocks, inserting
-/// vzero upper instructions before function calls.
-bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
-  if (MF.getTarget().getSubtarget().hasAVX512())
-    return false;
-  TII = MF.getTarget().getInstrInfo();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  bool EverMadeChange = false;
+// Insert a vzeroupper instruction before I.
+void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock &MBB) {
+  DebugLoc dl = I->getDebugLoc();
+  BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
+  ++NumVZU;
+  EverMadeChange = true;
+}
 
-  // Fast check: if the function doesn't use any ymm registers, we don't need
-  // to insert any VZEROUPPER instructions. This is constant-time, so it is
-  // cheap in the common case of no ymm use.
-  bool YMMUsed = false;
-  const TargetRegisterClass *RC = &X86::VR256RegClass;
-  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
-       i != e; i++) {
-    if (!MRI.reg_nodbg_empty(*i)) {
-      YMMUsed = true;
-      break;
-    }
+// Add MBB to the DirtySuccessors list if it hasn't already been added.
+void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
+  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
+    DirtySuccessors.push_back(&MBB);
+    BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
   }
-  if (!YMMUsed)
-    return EverMadeChange;
-
-  // Pre-compute the existence of any live-in YMM registers to this function
-  FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
-
-  assert(BBState.empty());
-  BBState.resize(MF.getNumBlockIDs(), 0);
-  BBSolved.resize(MF.getNumBlockIDs(), 0);
-
-  // Each BB state depends on all predecessors, loop over until everything
-  // converges. (Once we converge, we can implicitly mark everything that is
-  // still ST_UNKNOWN as ST_CLEAN.)
-  while (1) {
-    bool MadeChange = false;
-
-    // Process all basic blocks.
-    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
-      MadeChange |= processBasicBlock(MF, *I);
-
-    // If this iteration over the code changed anything, keep iterating.
-    if (!MadeChange) break;
-    EverMadeChange = true;
-  }
-
-  BBState.clear();
-  BBSolved.clear();
-  return EverMadeChange;
 }
 
 /// processBasicBlock - Loop over all of the instructions in the basic block,
 /// inserting vzero upper instructions before function calls.
-bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
-                                           MachineBasicBlock &BB) {
-  bool Changed = false;
-  unsigned BBNum = BB.getNumber();
+void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
 
-  // Don't process already solved BBs
-  if (BBSolved[BBNum])
-    return false; // No changes
+  // Start by assuming that the block is PASS_THROUGH, which implies no
+  // unguarded calls.
+  BlockExitState CurState = PASS_THROUGH;
+  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
 
-  // Check the state of all predecessors
-  unsigned EntryState = ST_INIT;
-  for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
-       PE = BB.pred_end(); PI != PE; ++PI) {
-    EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
-    if (EntryState == ST_DIRTY)
-      break;
-  }
-
-
-  // The entry MBB for the function may set the initial state to dirty if
-  // the function receives any YMM incoming arguments
-  if (&BB == MF.begin()) {
-    EntryState = ST_CLEAN;
-    if (FnHasLiveInYmm)
-      EntryState = ST_DIRTY;
-  }
-
-  // The current state is initialized according to the predecessors
-  unsigned CurState = EntryState;
-  bool BBHasCall = false;
-
-  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
-    DebugLoc dl = I->getDebugLoc();
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
     MachineInstr *MI = I;
-
     bool isControlFlow = MI->isCall() || MI->isReturn();
 
     // Shortcut: don't need to check regular instructions in dirty state.
-    if (!isControlFlow && CurState == ST_DIRTY)
+    if (!isControlFlow && CurState == EXITS_DIRTY)
       continue;
 
     if (hasYmmReg(MI)) {
       // We found a ymm-using instruction; this could be an AVX instruction,
       // or it could be control flow.
-      CurState = ST_DIRTY;
+      CurState = EXITS_DIRTY;
       continue;
     }
 
@@ -267,11 +203,9 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
     // standard calling convention is not used (RegMask is not used to mark
     // register clobbered and register usage (def/imp-def/use) is well-defined
     // and explicitly specified.
-    if (MI->isCall() && !clobbersAnyYmmReg(MI))
+    if (MI->isCall() && !callClobbersAnyYmmReg(MI))
       continue;
 
-    BBHasCall = true;
-
     // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
     // registers. This instruction has zero latency. In addition, the processor
     // changes back to Clean state, after which execution of Intel SSE
@@ -280,38 +214,101 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
     // execute SSE code.
     // FIXME: In some cases, we may want to move the VZEROUPPER into a
    // predecessor block.
-    if (CurState == ST_DIRTY) {
-      // Only insert the VZEROUPPER in case the entry state isn't unknown.
-      // When unknown, only compute the information within the block to have
-      // it available in the exit if possible, but don't change the block.
-      if (EntryState != ST_UNKNOWN) {
-        BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
-        ++NumVZU;
-      }
-
+    if (CurState == EXITS_DIRTY) {
      // After the inserted VZEROUPPER the state becomes clean again, but
      // other YMM may appear before other subsequent calls or even before
      // the end of the BB.
-      CurState = ST_CLEAN;
+      insertVZeroUpper(I, MBB);
+      CurState = EXITS_CLEAN;
+    } else if (CurState == PASS_THROUGH) {
+      // If this block is currently in pass-through state and we encounter a
+      // call then whether we need a vzeroupper or not depends on whether this
+      // block has successors that exit dirty. Record the location of the call,
+      // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
+      // It will be inserted later if necessary.
+      BlockStates[MBB.getNumber()].FirstUnguardedCall = I;
+      CurState = EXITS_CLEAN;
     }
   }
 
-  DEBUG(dbgs() << "MBB #" << BBNum
-               << ", current state: " << CurState << '\n');
+  DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+               << getBlockExitStateName(CurState) << '\n');
 
-  // A BB can only be considered solved when we both have done all the
-  // necessary transformations, and have computed the exit state. This happens
-  // in two cases:
-  //  1) We know the entry state: this immediately implies the exit state and
-  //     all the necessary transformations.
-  //  2) There are no calls, and a non-call instruction marks this block:
-  //     no transformations are necessary, and we know the exit state.
-  if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
-    BBSolved[BBNum] = true;
+  if (CurState == EXITS_DIRTY)
+    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+                                          SE = MBB.succ_end();
+         SI != SE; ++SI)
+      addDirtySuccessor(**SI);
 
-  if (CurState != BBState[BBNum])
-    Changed = true;
-
-  BBState[BBNum] = CurState;
-  return Changed;
+  BlockStates[MBB.getNumber()].ExitState = CurState;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzero upper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.getTarget().getSubtarget().hasAVX512())
+    return false;
+  TII = MF.getTarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  EverMadeChange = false;
+
+  // Fast check: if the function doesn't use any ymm registers, we don't need
+  // to insert any VZEROUPPER instructions. This is constant-time, so it is
+  // cheap in the common case of no ymm use.
+  bool YMMUsed = false;
+  const TargetRegisterClass *RC = &X86::VR256RegClass;
+  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
+       i != e; i++) {
+    if (!MRI.reg_nodbg_empty(*i)) {
+      YMMUsed = true;
+      break;
+    }
+  }
+  if (!YMMUsed) {
+    return false;
+  }
+
+  assert(BlockStates.empty() && DirtySuccessors.empty() &&
+         "X86VZeroUpper state should be clear");
+  BlockStates.resize(MF.getNumBlockIDs());
+
+  // Process all blocks. This will compute block exit states, record the first
+  // unguarded call in each block, and add successors of dirty blocks to the
+  // DirtySuccessors list.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+    processBasicBlock(*I);
+
+  // If any YMM regs are live in to this function, add the entry block to the
+  // DirtySuccessors list
+  if (checkFnHasLiveInYmm(MRI))
+    addDirtySuccessor(MF.front());
+
+  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
+  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+  // through PASS_THROUGH blocks.
+  while (!DirtySuccessors.empty()) {
+    MachineBasicBlock &MBB = *DirtySuccessors.back();
+    DirtySuccessors.pop_back();
+    BlockState &BBState = BlockStates[MBB.getNumber()];
+
+    // MBB is a successor of a dirty block, so its first call needs to be
+    // guarded.
+    if (BBState.FirstUnguardedCall != MBB.end())
+      insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+    // If this successor was a pass-through block then it is now dirty, and its
+    // successors need to be added to the worklist (if they haven't been
+    // already).
+    if (BBState.ExitState == PASS_THROUGH) {
+      DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+                   << " was Pass-through, is now Dirty-out.\n");
+      for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+                                            SE = MBB.succ_end();
+           SI != SE; ++SI)
+        addDirtySuccessor(**SI);
+    }
+  }
+
+  BlockStates.clear();
+  return EverMadeChange;
 }
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index bf4ab5be151..a2163a254e1 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
+declare i32 @foo()
 declare <4 x float> @do_sse(<4 x float>)
 declare <8 x float> @do_avx(<8 x float>)
 declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@@ -36,20 +37,38 @@ entry:
   ret <8 x float> %c
 }
 
+;; Check that vzeroupper is emitted for tail calls.
+
+; CHECK: _test02
+define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp {
+entry:
+  %add.i = fadd <8 x float> %a, %b
+  %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
+  ; CHECK: vzeroupper
+  ; CHECK: jmp _do_sse
+  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
+  ret <4 x float> %call3
+}
+
 ;; Test the pass convergence and also that vzeroupper is only issued when necessary,
 ;; for this function it should be only once
 
-; CHECK: _test02
-define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+; CHECK: _test03
+define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
 entry:
   %add.i = fadd <4 x float> %a, %b
-  br label %for.body
+  br label %while.cond
 
-for.body:                              ; preds = %for.body, %entry
+while.cond:
+  %call = tail call i32 @foo()
+  %tobool = icmp eq i32 %call, 0
+  br i1 %tobool, label %for.body, label %while.cond
+
+for.body:
   ; CHECK: LBB
   ; CHECK-NOT: vzeroupper
-  %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
-  %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
+  %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
+  %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
   ; CHECK: callq _do_sse
   %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
   ; CHECK-NEXT: callq _do_sse
@@ -63,14 +82,14 @@ for.body:                              ; preds = %for.body, %entry
   %exitcond = icmp eq i32 %1, 4
   br i1 %exitcond, label %for.end, label %for.body
 
-for.end:                               ; preds = %for.body
+for.end:
   ret <4 x float> %call14
 }
 
 ;; Check that we also perform vzeroupper when we return from a function.
 
-; CHECK: _test03
-define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+; CHECK: _test04
+define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
 entry:
   %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32>
   ; CHECK-NOT: vzeroupper
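
For readers following the new algorithm, the sketch below summarizes the second, worklist-driven phase of the pass in isolation. It is not part of the patch: the types and names Block, ExitState, and propagateDirtiness are hypothetical stand-ins for MachineBasicBlock, BlockExitState, and the loop at the end of the new runOnMachineFunction, kept just small enough to compile and run on their own.

// Standalone sketch (not part of the patch) of the worklist phase described
// above. All types and names here are simplified stand-ins for illustration,
// not LLVM's actual classes.
#include <cstdio>
#include <vector>

enum class ExitState { PassThrough, ExitsClean, ExitsDirty };

struct Block {
  ExitState Exit = ExitState::PassThrough;
  bool AddedToDirtySuccessors = false;
  bool HasUnguardedCall = false;  // stands in for FirstUnguardedCall != end()
  bool NeedsVZeroUpper = false;   // set when that call must be guarded
  std::vector<Block *> Successors;
};

// Add a block to the worklist at most once (mirrors addDirtySuccessor()).
static void addDirtySuccessor(Block &B, std::vector<Block *> &Worklist) {
  if (!B.AddedToDirtySuccessors) {
    Worklist.push_back(&B);
    B.AddedToDirtySuccessors = true;
  }
}

// Second phase of the pass: every block reachable as a successor of a dirty
// block gets its first unguarded call guarded, and pass-through blocks become
// dirty themselves, so their successors are enqueued in turn. No block is
// visited more than once.
static void propagateDirtiness(std::vector<Block *> &Worklist) {
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();

    if (B->HasUnguardedCall)
      B->NeedsVZeroUpper = true;  // a vzeroupper would be inserted here

    if (B->Exit == ExitState::PassThrough)
      for (Block *S : B->Successors)
        addDirtySuccessor(*S, Worklist);
  }
}

int main() {
  // Tiny CFG: Entry (exits dirty) -> Header (pass-through, contains a call);
  // Header loops to itself and also branches to Exit.
  Block Entry, Header, Exit;
  Entry.Exit = ExitState::ExitsDirty;
  Header.HasUnguardedCall = true;
  Entry.Successors = {&Header};
  Header.Successors = {&Header, &Exit};

  std::vector<Block *> Worklist;
  for (Block *S : Entry.Successors)  // successors of dirty blocks seed the list
    addDirtySuccessor(*S, Worklist);
  propagateDirtiness(Worklist);

  std::printf("Header needs vzeroupper: %s\n",
              Header.NeedsVZeroUpper ? "yes" : "no");
  return 0;
}

In this toy CFG the loop header that follows the dirty entry block gets its call guarded exactly once, which is the loops-after-vector-code case the commit message says the old iterative algorithm could miss.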