diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 1fd55128e22..c481eb9552c 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -32,6 +32,7 @@ set(sources
   X86Subtarget.cpp
   X86TargetMachine.cpp
   X86TargetObjectFile.cpp
+  X86VZeroUpper.cpp
   )
 
 if( CMAKE_CL_64 )
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index d1e193304a5..d480d0c8654 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -48,6 +48,11 @@ FunctionPass *createX86FloatingPointStackifierPass();
 /// crossings.
 FunctionPass *createSSEDomainFixPass();
 
+/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
+/// before each call to avoid transition penalty between functions encoded with
+/// AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
 /// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
 /// to the specified MCE object.
 FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 569c0408f30..95e7021dce5 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegistry.h"
@@ -91,6 +92,16 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
     FloatABIType = FloatABI::Hard;
 }
 
+//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+bool UseVZeroUpper;
+
+static cl::opt<bool, true>
+VZeroUpper("x86-use-vzeroupper",
+  cl::desc("Minimize AVX to SSE transition penalty"),
+  cl::location(UseVZeroUpper), cl::init(false));
+
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
@@ -125,6 +136,11 @@ bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM,
     PM.add(createSSEDomainFixPass());
     return true;
   }
+
+  if (Subtarget.hasAVX() && UseVZeroUpper) {
+    PM.add(createX86IssueVZeroUpperPass());
+    return true;
+  }
 
   return false;
 }
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 00000000000..d87efc99b24
--- /dev/null
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,106 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when transferring control between AVX encoded instructions and
+// the legacy SSE encoding.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+  struct VZeroUpperInserter : public MachineFunctionPass {
+    static char ID;
+    VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    virtual const char *getPassName() const { return "X86 vzeroupper inserter"; }
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+    MachineBasicBlock *MBB;     // Current basic block.
+  };
+  char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+  return new VZeroUpperInserter();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzeroupper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  bool Changed = false;
+
+  // Process all basic blocks.
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+    Changed |= processBasicBlock(MF, *BB);
+
+  return Changed;
+}
+
+static bool isCallToModuleFn(const MachineInstr *MI) {
+  assert(MI->getDesc().isCall() && "Isn't a call instruction");
+
+  for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    if (!MO.isGlobal())
+      continue;
+
+    const GlobalValue *GV = MO.getGlobal();
+    GlobalValue::LinkageTypes LT = GV->getLinkage();
+    if (GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) ||
+        (GV->isExternalLinkage(LT) && !GV->isDeclaration()))
+      return true;
+
+    return false;
+  }
+  return false;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzeroupper instructions before function calls.
+bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
+                                           MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    DebugLoc dl = I->getDebugLoc();
+
+    // Insert a vzeroupper instruction before each control transfer
+    // to functions outside this module.
+    if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) {
+      BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+      ++NumVZU;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
new file mode 100644
index 00000000000..eaf236c6c77
--- /dev/null
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+define <4 x float> @do_sse_local(<4 x float> %a) nounwind uwtable readnone ssp {
+entry:
+  %add.i = fadd <4 x float> %a, %a
+  ret <4 x float> %add.i
+}
+
+; CHECK: _test00
+define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+  %add.i = fadd <4 x float> %a, %b
+  ; CHECK: vzeroupper
+  ; CHECK-NEXT: callq _do_sse
+  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.i) nounwind
+  %sub.i = fsub <4 x float> %call3, %add.i
+  ; CHECK-NOT: vzeroupper
+  ; CHECK: callq _do_sse_local
+  %call8 = tail call <4 x float> @do_sse_local(<4 x float> %sub.i)
+  ; CHECK: vzeroupper
+  ; CHECK-NEXT: jmp _do_sse
+  %call10 = tail call <4 x float> @do_sse(<4 x float> %call8) nounwind
+  ret <4 x float> %call10
+}
+
+declare <4 x float> @do_sse(<4 x float>)