diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 39584942468..9bb54a826bd 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -14,14 +14,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-codegen"
+#define DEBUG_TYPE "x86-vzeroupper"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
@@ -41,6 +43,60 @@ namespace {
   private:
     const TargetInstrInfo *TII; // Machine instruction info.
     MachineBasicBlock *MBB;     // Current basic block
+
+    // Any YMM register live-in to this function?
+    bool FnHasLiveInYmm;
+
+    // BBState - Contains the state of each MBB: unknown, clean, dirty
+    SmallVector<unsigned, 8> BBState;
+
+    // BBSolved - Keeps track of all MBBs which have already been analyzed
+    // and need no further processing.
+    BitVector BBSolved;
+
+    // Machine Basic Blocks are classified according to this pass:
+
+    //  ST_UNKNOWN - The MBB state is unknown, meaning that from the entry
+    //  state until the MBB exit no instruction using YMM changes the state
+    //  to dirty, at least one incoming predecessor is unknown, and no
+    //  incoming predecessor is dirty.
+
+    //  ST_CLEAN - No YMM usage at the end of the MBB. An MBB may contain
+    //  instructions using YMM and still be marked ST_CLEAN, as long as the
+    //  state is cleaned up by a vzeroupper before any call.
+
+    //  ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
+    //  vzeroupper instruction.
+
+    //  ST_INIT - Placeholder for an empty state set
+
+    enum {
+      ST_UNKNOWN = 0,
+      ST_CLEAN = 1,
+      ST_DIRTY = 2,
+      ST_INIT = 3
+    };
+
+    // computeState - Given two states, compute the resulting state, in
+    // the following way:
+
+    //  1) One dirty state yields another dirty state
+    //  2) All states must be clean for the result to be clean
+    //  3) If none of the above applies and either state is unknown,
+    //     the result state is also unknown
+
+    unsigned computeState(unsigned PrevState, unsigned CurState) {
+      if (PrevState == ST_INIT)
+        return CurState;
+
+      if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
+        return ST_DIRTY;
+
+      if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
+        return ST_CLEAN;
+
+      return ST_UNKNOWN;
+    }
+
   };
   char VZeroUpperInserter::ID = 0;
 }
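
The merge performed by computeState behaves like a meet over a small lattice: ST_DIRTY absorbs every other state, ST_CLEAN survives only when all inputs are clean, and ST_INIT is the identity element that seeds the fold over predecessors. Below is a minimal standalone sketch of those three rules (illustrative only, not part of the patch):

#include <cassert>

enum { ST_UNKNOWN = 0, ST_CLEAN = 1, ST_DIRTY = 2, ST_INIT = 3 };

static unsigned computeState(unsigned PrevState, unsigned CurState) {
  if (PrevState == ST_INIT)
    return CurState;   // First input seen: adopt its state.
  if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
    return ST_DIRTY;   // Rule 1: any dirty input wins.
  if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
    return ST_CLEAN;   // Rule 2: clean only if all inputs are clean.
  return ST_UNKNOWN;   // Rule 3: any other mix is unknown.
}

int main() {
  assert(computeState(ST_INIT, ST_CLEAN) == ST_CLEAN);      // identity element
  assert(computeState(ST_UNKNOWN, ST_DIRTY) == ST_DIRTY);   // dirty absorbs
  assert(computeState(ST_CLEAN, ST_UNKNOWN) == ST_UNKNOWN); // mixed is unknown
  return 0;
}
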
@@ -49,37 +105,82 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
   return new VZeroUpperInserter();
 }
 
+static bool isYmmReg(unsigned Reg) {
+  if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
+    return true;
+
+  return false;
+}
+
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
+       E = MRI.livein_end(); I != E; ++I)
+    if (isYmmReg(I->first))
+      return true;
+
+  return false;
+}
+
+static bool hasYmmReg(MachineInstr *MI) {
+  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    if (MO.isDebug())
+      continue;
+    if (isYmmReg(MO.getReg()))
+      return true;
+  }
+  return false;
+}
+
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// vzero upper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getTarget().getInstrInfo();
-  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool EverMadeChange = false;
 
-  // Process any unreachable blocks in arbitrary order now.
-  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
-    Changed |= processBasicBlock(MF, *BB);
-
-  return Changed;
-}
-
-static bool isCallToModuleFn(const MachineInstr *MI) {
-  assert(MI->getDesc().isCall() && "Isn't a call instruction");
-
-  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-
-    if (!MO.isGlobal())
-      continue;
-
-    const GlobalValue *GV = MO.getGlobal();
-    GlobalValue::LinkageTypes LT = GV->getLinkage();
-    if (GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) ||
-        (GV->isExternalLinkage(LT) && !GV->isDeclaration()))
-      return true;
-
-    return false;
+  // Fast check: if the function doesn't use any ymm registers, we don't need
+  // to insert any VZEROUPPER instructions. This is constant-time, so it is
+  // cheap in the common case of no ymm use.
+  bool YMMUsed = false;
+  TargetRegisterClass *RC = X86::VR256RegisterClass;
+  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
+       i != e; i++) {
+    if (MRI.isPhysRegUsed(*i)) {
+      YMMUsed = true;
+      break;
+    }
   }
-  return false;
+  if (!YMMUsed)
+    return EverMadeChange;
+
+  // Pre-compute the existence of any live-in YMM registers to this function
+  FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
+  assert(BBState.empty());
+  BBState.resize(MF.getNumBlockIDs(), 0);
+  BBSolved.resize(MF.getNumBlockIDs(), 0);
+
+  // Each BB state depends on its predecessors, so loop until everything
+  // converges. (Once we converge, we can implicitly mark everything that is
+  // still ST_UNKNOWN as ST_CLEAN.)
+  while (1) {
+    bool MadeChange = false;
+
+    // Process all basic blocks.
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      MadeChange |= processBasicBlock(MF, *I);
+
+    // If this iteration over the code changed anything, keep iterating.
+    if (!MadeChange) break;
+    EverMadeChange = true;
+  }
+
+  BBState.clear();
+  BBSolved.clear();
+  return EverMadeChange;
 }
 
 /// processBasicBlock - Loop over all of the instructions in the basic block,
@@ -87,19 +188,98 @@ static bool isCallToModuleFn(const MachineInstr *MI) {
 /// inserting vzero upper instructions before function calls.
 bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
                                            MachineBasicBlock &BB) {
   bool Changed = false;
+  unsigned BBNum = BB.getNumber();
   MBB = &BB;
 
+  // Don't process already solved BBs
+  if (BBSolved[BBNum])
+    return false; // No changes
+
+  // Check the state of all predecessors
+  unsigned EntryState = ST_INIT;
+  for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
+       PE = BB.pred_end(); PI != PE; ++PI) {
+    EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
+    if (EntryState == ST_DIRTY)
+      break;
+  }
+
+  // The entry MBB for the function may set the initial state to dirty if
+  // the function receives any YMM incoming arguments
+  if (MBB == MF.begin()) {
+    EntryState = ST_CLEAN;
+    if (FnHasLiveInYmm)
+      EntryState = ST_DIRTY;
+  }
+
+  // The current state is initialized according to the predecessors
+  unsigned CurState = EntryState;
+  bool BBHasCall = false;
+
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
     MachineInstr *MI = I;
     DebugLoc dl = I->getDebugLoc();
+    bool isControlFlow = MI->getDesc().isCall() || MI->getDesc().isReturn();
 
-    // Insert a vzeroupper instruction before each control transfer
-    // to functions outside this module
-    if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) {
-      BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
-      ++NumVZU;
+    // Shortcut: no need to check regular instructions in dirty state.
+    if (!isControlFlow && CurState == ST_DIRTY)
+      continue;
+
+    if (hasYmmReg(MI)) {
+      // We found a ymm-using instruction; this could be an AVX instruction,
+      // or it could be control flow.
+      CurState = ST_DIRTY;
+      continue;
+    }
+
+    // Check for control-flow out of the current function (which might
+    // indirectly execute SSE instructions).
+    if (!isControlFlow)
+      continue;
+
+    BBHasCall = true;
+
+    // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+    // registers. This instruction has zero latency. In addition, the
+    // processor changes back to Clean state, after which execution of Intel
+    // SSE instructions or Intel AVX instructions has no transition penalty.
+    // Add the VZEROUPPER instruction before any function call/return that
+    // might execute SSE code.
+    // FIXME: In some cases, we may want to move the VZEROUPPER into a
+    // predecessor block.
+    if (CurState == ST_DIRTY) {
+      // Only insert the VZEROUPPER if the entry state isn't unknown. When
+      // unknown, only compute the information within the block to have it
+      // available at the exit if possible, but don't change the block.
+      if (EntryState != ST_UNKNOWN) {
+        BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+        ++NumVZU;
+      }
+
+      // After the inserted VZEROUPPER the state becomes clean again, but
+      // other YMM uses may appear before subsequent calls or even before
+      // the end of the BB.
+      CurState = ST_CLEAN;
    }
   }
 
+  DEBUG(dbgs() << "MBB #" << BBNum
+               << ", current state: " << CurState << '\n');
+
+  // A BB can only be considered solved when we have both done all the
+  // necessary transformations and computed the exit state. This happens in
+  // two cases:
+  //  1) We know the entry state: this immediately implies the exit state and
+  //     all the necessary transformations.
+  //  2) There are no calls, and an instruction inside the block has made the
+  //     exit state known: no transformations are necessary, and we know the
+  //     exit state.
+  if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
+    BBSolved[BBNum] = true;
+
+  if (CurState != BBState[BBNum])
+    Changed = true;
+
+  BBState[BBNum] = CurState;
   return Changed;
 }
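
To make the per-block scan concrete, the following standalone model mirrors the decision logic of processBasicBlock (illustrative only; the 'y'/'c' event encoding and the scanBlock helper are assumptions of this sketch, not part of the patch):

#include <cassert>
#include <string>

enum { ST_UNKNOWN = 0, ST_CLEAN = 1, ST_DIRTY = 2 };

// Scan a block given its entry state and a stream of events: 'y' is a
// YMM-using instruction, 'c' is a call or return. Returns the exit state
// and counts how many vzerouppers the pass would emit.
static unsigned scanBlock(unsigned EntryState, const std::string &Events,
                          unsigned &NumVZU) {
  unsigned CurState = EntryState;
  NumVZU = 0;
  for (size_t I = 0; I != Events.size(); ++I) {
    if (Events[I] == 'y') {
      CurState = ST_DIRTY;          // Any YMM use dirties the state.
    } else if (Events[I] == 'c' && CurState == ST_DIRTY) {
      if (EntryState != ST_UNKNOWN) // Only touch the block once the entry
        ++NumVZU;                   // state is known, as in the pass above.
      CurState = ST_CLEAN;          // State is clean past this call.
    }
  }
  return CurState;
}

int main() {
  unsigned NumVZU;
  // Dirty on entry, two calls: one vzeroupper before the first call suffices.
  unsigned Exit = scanBlock(ST_DIRTY, "cc", NumVZU);
  assert(Exit == ST_CLEAN && NumVZU == 1);
  // Unknown entry: the exit state is still computed, but nothing is emitted.
  Exit = scanBlock(ST_UNKNOWN, "yc", NumVZU);
  assert(Exit == ST_CLEAN && NumVZU == 0);
  (void)Exit;
  return 0;
}
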
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index eaf236c6c77..bf4ab5be151 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,26 +1,83 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
-define <4 x float> @do_sse_local(<4 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %add.i = fadd <4 x float> %a, %a
-  ret <4 x float> %add.i
-}
+declare <4 x float> @do_sse(<4 x float>)
+declare <8 x float> @do_avx(<8 x float>)
+declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
+@x = common global <4 x float> zeroinitializer, align 16
+@g = common global <8 x float> zeroinitializer, align 32
+
+;; Basic checking - don't emit any vzeroupper instruction
 
 ; CHECK: _test00
 define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
 entry:
-  %add.i = fadd <4 x float> %a, %b
-  ; CHECK: vzeroupper
-  ; CHECK-NEXT: callq _do_sse
-  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.i) nounwind
-  %sub.i = fsub <4 x float> %call3, %add.i
   ; CHECK-NOT: vzeroupper
-  ; CHECK: callq _do_sse_local
-  %call8 = tail call <4 x float> @do_sse_local(<4 x float> %sub.i)
-  ; CHECK: vzeroupper
-  ; CHECK-NEXT: jmp _do_sse
-  %call10 = tail call <4 x float> @do_sse(<4 x float> %call8) nounwind
-  ret <4 x float> %call10
+  %add.i = fadd <4 x float> %a, %b
+  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
+  ; CHECK: ret
+  ret <4 x float> %call3
 }
 
-declare <4 x float> @do_sse(<4 x float>)
+;; Check 256-bit parameter passing
+
+; CHECK: _test01
+define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
+entry:
+  %tmp = load <4 x float>* @x, align 16
+  ; CHECK: vzeroupper
+  ; CHECK-NEXT: callq _do_sse
+  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
+  store <4 x float> %call, <4 x float>* @x, align 16
+  ; CHECK-NOT: vzeroupper
+  ; CHECK: callq _do_sse
+  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
+  store <4 x float> %call2, <4 x float>* @x, align 16
+  ; CHECK: ret
+  ret <8 x float> %c
+}
+
+;; Check that the pass converges and that vzeroupper is only issued when
+;; necessary; for this function it should happen only once
+
+; CHECK: _test02
+define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+  %add.i = fadd <4 x float> %a, %b
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  ; CHECK: LBB
+  ; CHECK-NOT: vzeroupper
+  %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
+  %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
+  ; CHECK: callq _do_sse
+  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
+  ; CHECK-NEXT: callq _do_sse
+  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
+  %tmp11 = load <8 x float>* @g, align 32
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
+  ; CHECK: vzeroupper
+  ; CHECK-NEXT: callq _do_sse
+  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
+  %1 = add nsw i32 %i.018, 1
+  %exitcond = icmp eq i32 %1, 4
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret <4 x float> %call14
+}
+
+;; Check that we also perform vzeroupper when we return from a function.
+
+; CHECK: _test03
+define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ; CHECK-NOT: vzeroupper
+  ; CHECK: call
+  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
+  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK: vzeroupper
+  ; CHECK: ret
+  ret <4 x float> %shuf2
+}
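
The loop in test02 also illustrates why runOnMachineFunction iterates to a fixed point: on the first sweep the loop block merges the still-unknown state from its own back edge, and a second sweep is needed to confirm that no exit state changes. A toy model of that convergence (illustrative only; the three-block entry/loop/exit CFG mirrors the shape of test02 and is an assumption of this sketch):

#include <cstdio>
#include <vector>

enum { ST_UNKNOWN = 0, ST_CLEAN = 1, ST_DIRTY = 2, ST_INIT = 3 };

static unsigned computeState(unsigned Prev, unsigned Cur) {
  if (Prev == ST_INIT)
    return Cur;
  if (Prev == ST_DIRTY || Cur == ST_DIRTY)
    return ST_DIRTY;
  if (Prev == ST_CLEAN && Cur == ST_CLEAN)
    return ST_CLEAN;
  return ST_UNKNOWN;
}

int main() {
  // Block 0 is the entry, block 1 is a self-loop that uses YMM, and block 2
  // is the exit.
  std::vector<std::vector<int> > Preds(3);
  Preds[1].push_back(0);
  Preds[1].push_back(1); // back edge
  Preds[2].push_back(1);

  std::vector<unsigned> BBState(3, ST_UNKNOWN);
  unsigned Iterations = 0;
  for (bool MadeChange = true; MadeChange; ++Iterations) {
    MadeChange = false;
    for (int B = 0; B < 3; ++B) {
      // Merge the exit states of all predecessors, as the pass does.
      unsigned Entry = ST_INIT;
      for (size_t P = 0; P < Preds[B].size(); ++P)
        Entry = computeState(Entry, BBState[Preds[B][P]]);
      if (B == 0)
        Entry = ST_CLEAN;                            // Entry, no live-in YMM.
      unsigned Exit = (B == 1) ? ST_DIRTY : Entry;   // Block 1 uses YMM.
      if (Exit != BBState[B]) {
        BBState[B] = Exit;
        MadeChange = true;
      }
    }
  }
  // One sweep that makes changes plus a second that confirms the fixed
  // point; prints "converged after 2 iterations: 1 2 2".
  std::printf("converged after %u iterations: %u %u %u\n",
              Iterations, BBState[0], BBState[1], BBState[2]);
  return 0;
}
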