This patch adds the X86FixupLEAs pass, which will reduce instruction

latency for certain models of the Intel Atom family, by converting
instructions into their equivalent LEA instructions, when it is both
useful and possible to do so.




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180573 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Preston Gurd
2013-04-25 20:29:37 +00:00
parent 975b1ddf60
commit d6ac8e9a03
11 changed files with 444 additions and 1 deletions

View File

@@ -33,6 +33,7 @@ set(sources
X86TargetObjectFile.cpp
X86TargetTransformInfo.cpp
X86VZeroUpper.cpp
X86FixupLEAs.cpp
)
if( CMAKE_CL_64 )

View File

@@ -69,6 +69,11 @@ ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
/// createX86PadShortFunctions - Return a pass that pads short functions
/// with NOOPs. This will prevent a stall when returning on the Atom.
FunctionPass *createX86PadShortFunctions();
/// createX86FixupLEAs - Return a pass that selectively replaces
/// certain instructions (like add, sub, inc, dec, some shifts,
/// and some multiplies) by equivalent LEA instructions, in order
/// to eliminate execution delays in some Atom processors.
FunctionPass *createX86FixupLEAs();
} // End llvm namespace

View File

@@ -139,6 +139,8 @@ def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
"CallRegIndirect", "true",
"Call register indirect">;
def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -188,6 +190,7 @@ def : ProcessorModel<"atom", AtomModel,
FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
FeatureSlowDivide,
FeatureCallRegIndirect,
FeatureLEAUsesAG,
FeaturePadShortFunctions]>;
// "Arrandale" along with corei3 and corei5

View File

@@ -0,0 +1,251 @@
//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which will find instructions which
// can be re-written as LEA instructions in order to reduce pipeline
// delays for some models of the Intel Atom family.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "x86-fixup-LEAs"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
/// FixupLEAPass - Machine function pass that walks every instruction with a
/// memory operand, looks backwards for the instruction that writes the base
/// or index register of that operand, and tries to replace that writer with
/// an equivalent LEA instruction.
class FixupLEAPass : public MachineFunctionPass {
  /// Classification of how one instruction touches a register of interest.
  enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };

  /// Pass identifier; handed to the MachineFunctionPass constructor.
  static char ID;

  /// Run processInstruction over each instruction of the block MFI.
  bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);

  /// Human-readable pass name.
  virtual const char *getPassName() const { return "X86 Atom LEA Fixup";}

  /// Find the recent writer of register operand p (searching backwards
  /// from I) and try to turn it into an LEA.
  void seekLEAFixup(MachineOperand& p, MachineBasicBlock::iterator& I,
                    MachineFunction::iterator MFI);

  /// Examine the base and index registers of I's memory operand, if any.
  void processInstruction(MachineBasicBlock::iterator& I,
                          MachineFunction::iterator MFI);

  /// Report whether instruction I writes, reads, or does not mention the
  /// register held by operand p.
  RegUsageState usesRegister(MachineOperand& p,
                             MachineBasicBlock::iterator I);

  /// Walk backwards from I looking for an instruction that writes the
  /// register held by operand p.
  MachineBasicBlock::iterator searchBackwards(MachineOperand& p,
                                              MachineBasicBlock::iterator& I,
                                              MachineFunction::iterator MFI);

  /// Build and insert an LEA equivalent to the instruction at MBBI, when
  /// one exists.
  MachineInstr* postRAConvertToLEA(MachineFunction::iterator &MFI,
                                   MachineBasicBlock::iterator &MBBI,
                                   LiveVariables *LV) const;

public:
  FixupLEAPass() : MachineFunctionPass(ID) {}

  /// Entry point invoked once per machine function.
  virtual bool runOnMachineFunction(MachineFunction &MF);

private:
  // Per-function state; all four are (re)assigned at the start of
  // runOnMachineFunction and are not initialized by the constructor.
  MachineFunction *MF;        // Function currently being processed.
  const TargetMachine *TM;    // Target of the current function.
  const TargetInstrInfo *TII; // Machine instruction info.
  LiveVariables *LV;          // Result of getAnalysisIfAvailable; may be null.
};
char FixupLEAPass::ID = 0;
}
/// postRAConvertToLEA - Try to re-express the instruction at MBBI as an LEA.
/// 32- and 64-bit register-to-register moves are rewritten here directly:
/// the new LEA is inserted in front of MBBI and returned. The immediate-ADD
/// opcodes listed below are rejected (zero is returned) when their third
/// operand is not an immediate. In every other case the result of
/// TargetInstrInfo::convertToThreeAddress is returned unchanged.
/// This function itself never erases the instruction at MBBI.
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
                                 MachineBasicBlock::iterator &MBBI,
                                 LiveVariables *LV) const {
  MachineInstr *OldMI = MBBI;
  const int Opc = OldMI->getOpcode();

  // A plain register copy becomes an LEA whose address is just the source
  // register: base = Src, scale = 1, no index, displacement 0, no segment.
  if (Opc == X86::MOV32rr || Opc == X86::MOV64rr) {
    const MachineOperand &SrcOp = OldMI->getOperand(1);
    const MachineOperand &DestOp = OldMI->getOperand(0);
    MachineInstr *LEA =
      BuildMI(*MF, OldMI->getDebugLoc(),
              TII->get(Opc == X86::MOV32rr ? X86::LEA32r : X86::LEA64r))
        .addOperand(DestOp)
        .addOperand(SrcOp).addImm(1).addReg(0).addImm(0).addReg(0);
    MFI->insert(MBBI, LEA); // Place the LEA ahead of the original move.
    return LEA;
  }

  switch (Opc) {
  case X86::ADD64ri32:
  case X86::ADD64ri8:
  case X86::ADD64ri32_DB:
  case X86::ADD64ri8_DB:
  case X86::ADD32ri:
  case X86::ADD32ri8:
  case X86::ADD32ri_DB:
  case X86::ADD32ri8_DB:
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri_DB:
  case X86::ADD16ri8_DB:
    // convertToThreeAddress will call getImm() on operand 2, which
    // requires isImm() to be true, so bail out early otherwise.
    if (!OldMI->getOperand(2).isImm())
      return 0;
    break;
  default:
    break;
  }

  return TII->convertToThreeAddress(MFI, MBBI, LV);
}
/// createX86FixupLEAs - Factory for the LEA fixup pass: returns a newly
/// heap-allocated FixupLEAPass through its FunctionPass base.
FunctionPass *llvm::createX86FixupLEAs() {
  FunctionPass *Pass = new FixupLEAPass();
  return Pass;
}
/// runOnMachineFunction - Pass entry point. Caches the function, its target
/// machine, the instruction info and (if available) LiveVariables in the
/// pass members, then visits every basic block in order so that
/// instructions can be replaced by equivalent LEAs where possible.
/// Always returns true, whether or not any instruction was replaced.
bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
  // Record the per-function context used by the helper methods.
  MF = &Func;
  TM = &Func.getTarget();
  TII = TM->getInstrInfo();
  LV = getAnalysisIfAvailable<LiveVariables>();

  DEBUG(dbgs() << "Start X86FixupLEAs\n";);

  // Visit each basic block of the function in layout order.
  MachineFunction::iterator BB = Func.begin();
  MachineFunction::iterator BBEnd = Func.end();
  while (BB != BBEnd) {
    processBasicBlock(Func, BB);
    ++BB;
  }

  DEBUG(dbgs() << "End X86FixupLEAs\n";);

  return true;
}
/// usesRegister - Classify how instruction I touches the register held by
/// operand p. Returns RU_Write as soon as a defining operand naming that
/// register is found, RU_Read if the register only appears in non-def
/// operands, and RU_NotUsed if no operand names it. Registers are compared
/// by exact register number only.
FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p,
                                MachineBasicBlock::iterator I) {
  MachineInstr *Inst = I;
  bool SawRead = false;
  for (unsigned Idx = 0, NumOps = Inst->getNumOperands(); Idx != NumOps;
       ++Idx) {
    MachineOperand &Op = Inst->getOperand(Idx);
    // Only register operands naming the same register are of interest.
    if (!Op.isReg() || Op.getReg() != p.getReg())
      continue;
    // A definition takes precedence over any reads seen so far.
    if (Op.isDef())
      return RU_Write;
    SawRead = true;
  }
  return SawRead ? RU_Read : RU_NotUsed;
}
/// getPreviousInstr - Move iterator I one instruction backwards within the
/// basic block MFI. When I is already at the start of the block, wrap
/// around to the last instruction of the block, but only if the block is
/// one of its own predecessors (i.e. it branches to itself). Returns true
/// if I was updated, false if there is no previous instruction to visit
/// (in which case I is left untouched).
static inline bool getPreviousInstr(MachineBasicBlock::iterator& I,
                                    MachineFunction::iterator MFI) {
  // Common case: simply step back inside the block.
  if (I != MFI->begin()) {
    --I;
    return true;
  }

  // At the top of the block: only a self-loop lets us continue.
  if (!MFI->isPredecessor(MFI))
    return false;

  I = --MFI->end();
  return true;
}
/// searchBackwards - Starting just before I, step backwards through the
/// basic block (wrapping around via getPreviousInstr) looking for an
/// instruction that writes the register held by operand p. The search
/// gives up when it reaches a call or inline asm, when it comes back
/// around to I, when there is no previous instruction, or once the
/// accumulated instruction latency exceeds INSTR_DISTANCE_THRESHOLD.
/// Returns the writing instruction, or a null iterator if none was found.
MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
                                   MachineBasicBlock::iterator& I,
                                   MachineFunction::iterator MFI) {
  static const int INSTR_DISTANCE_THRESHOLD = 5;

  // Latency budget consumed so far; starts at 1, not 0.
  int Distance = 1;
  MachineBasicBlock::iterator Cand = I;

  for (bool More = getPreviousInstr(Cand, MFI); More && I != Cand;
       More = getPreviousInstr(Cand, MFI)) {
    // Never look past a call or inline asm, and stop once the candidate
    // is too far back to make a difference.
    if (Cand->isCall() || Cand->isInlineAsm() ||
        InstrDistanceExceeded(Distance, INSTR_DISTANCE_THRESHOLD))
      break;

    if (usesRegister(p, Cand) == RU_Write)
      return Cand;

    Distance += TII->getInstrLatency(TM->getInstrItineraryData(), Cand);
  }

  return 0;
}
/// processInstruction - If instruction I has a memory operand (as reported
/// by X86II::getMemoryOperandNo), inspect the base register and then the
/// index register of its address. For each one that is a register operand
/// other than ESP, call seekLEAFixup to look for the instruction that sets
/// it and try to replace that instruction with an equivalent LEA.
/// Instructions without a memory operand are left alone.
void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I,
                                      MachineFunction::iterator MFI) {
  MachineInstr *Inst = I;
  int Opc = Inst->getOpcode();
  const MCInstrDesc &MCID = Inst->getDesc();

  // A negative result means the instruction has no memory operand.
  int MemOpNo = X86II::getMemoryOperandNo(MCID.TSFlags, Opc);
  if (MemOpNo < 0)
    return;
  MemOpNo += X86II::getOperandBias(MCID);

  // NOTE(review): only X86::ESP is excluded below; RSP is not compared
  // against - confirm this is intended for 64-bit code.

  // Base register of the address.
  MachineOperand &Base = Inst->getOperand(MemOpNo + X86::AddrBaseReg);
  if (Base.isReg() && Base.getReg() != X86::ESP)
    seekLEAFixup(Base, I, MFI);

  // Index register of the address.
  MachineOperand &Index = Inst->getOperand(MemOpNo + X86::AddrIndexReg);
  if (Index.isReg() && Index.getReg() != X86::ESP)
    seekLEAFixup(Index, I, MFI);
}
/// seekLEAFixup - Given a register operand p used by instruction I, search
/// backwards for the instruction that writes that register. If one is
/// found and postRAConvertToLEA can produce an equivalent LEA for it, the
/// original writer is erased from the block and the newly created LEA is
/// itself run through processInstruction, so that the instructions feeding
/// the new LEA get the same treatment.
void FixupLEAPass::seekLEAFixup(MachineOperand& p,
                                MachineBasicBlock::iterator& I,
                                MachineFunction::iterator MFI) {
  MachineBasicBlock::iterator Writer = searchBackwards(p, I, MFI);
  if (!Writer)
    return; // No nearby instruction writes this register.

  MachineInstr *LEA = postRAConvertToLEA(MFI, Writer, LV);
  if (!LEA)
    return; // The writer has no LEA equivalent.

  ++NumLEAs;
  DEBUG(dbgs() << "Candidate to replace:"; Writer->dump(););
  // now to replace with an equivalent LEA...
  DEBUG(dbgs() << "Replaced by: "; LEA->dump(););
  MFI->erase(Writer);

  // Chain backwards: the new LEA has address operands of its own.
  MachineBasicBlock::iterator NewIt(LEA);
  processInstruction(NewIt, MFI);
}
/// processBasicBlock - Call processInstruction on every instruction of the
/// basic block MFI, from first to last, so that instructions feeding
/// address operands can be replaced with LEA instructions where
/// appropriate. The MF argument is not used. Always returns false.
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
                                     MachineFunction::iterator MFI) {
  MachineBasicBlock::iterator Cur = MFI->begin();
  // The end iterator is re-read on every step, as instructions may be
  // inserted into or erased from the block while it is being processed.
  while (Cur != MFI->end()) {
    processInstruction(Cur, MFI);
    ++Cur;
  }
  return false;
}

View File

@@ -467,6 +467,7 @@ void X86Subtarget::initializeEnvironment() {
PostRAScheduler = false;
PadShortFunctions = false;
CallRegIndirect = false;
LEAUsesAG = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;

View File

@@ -165,6 +165,9 @@ protected:
/// CallRegIndirect - True if the Calls with memory reference should be converted
/// to a register-based indirect call.
bool CallRegIndirect;
/// LEAUsesAG - True if the LEA instruction inputs have to be ready at
/// address generation (AG) time.
bool LEAUsesAG;
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
@@ -278,6 +281,7 @@ public:
bool hasSlowDivide() const { return HasSlowDivide; }
bool padShortFunctions() const { return PadShortFunctions; }
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }

View File

@@ -215,6 +215,11 @@ bool X86PassConfig::addPreEmitPass() {
addPass(createX86PadShortFunctions());
ShouldPrint = true;
}
if (getOptLevel() != CodeGenOpt::None &&
getX86Subtarget().LEAusesAG()){
addPass(createX86FixupLEAs());
ShouldPrint = true;
}
return ShouldPrint;
}

View File

@@ -0,0 +1,38 @@
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
; CHECK: addl
; CHECK-NEXT:leal
; CHECK-NEXT:decl
; CHECK-NEXT:jne
; Test for the FixupLEAs pre-emit pass. An LEA should be substituted for the ADD
; that increments the array pointer because it is within 5 instructions of the
; corresponding load. The ADD precedes the load by following the loop back edge.
; Original C code
;int test(int n, int * array)
;{
; int sum = 0;
; for(int i = 0; i < n; i++)
; sum += array[i];
; return sum;
;}
; Sums %array[0] .. %array[%n - 1] and returns the total; returns 0 when
; %n is not greater than 0.
define i32 @test(i32 %n, i32* nocapture %array) {
entry:
; Skip the loop entirely unless %n > 0.
%cmp4 = icmp sgt i32 %n, 0
br i1 %cmp4, label %for.body, label %for.end
for.body:
; %i.06 is the loop counter and %sum.05 the running sum; both start at 0.
%i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%sum.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
; Load %array[%i.06] and add it to the running sum.
%arrayidx = getelementptr inbounds i32* %array, i32 %i.06
%0 = load i32* %arrayidx, align 4
%add = add nsw i32 %0, %sum.05
; Advance the counter and loop back until it equals %n.
%inc = add nsw i32 %i.06, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.end, label %for.body
for.end:
; Result is 0 when the loop was skipped, otherwise the final sum.
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
ret i32 %sum.0.lcssa
}

View File

@@ -0,0 +1,84 @@
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
; CHECK:BB#5
; CHECK-NEXT:leal
; CHECK-NEXT:leal
; CHECK-NEXT:leal
; CHECK-NEXT:movl
; Test for fixup lea pre-emit pass. LEA instructions should be substituted for
; ADD instructions which compute the address and index of the load because they
; precede the load within 5 instructions. An LEA should also be substituted for
; an ADD which computes part of the index because it precedes the index LEA
; within 5 instructions, this substitution is referred to as backwards chaining.
; Original C Code
;struct node_t
;{
; int k, m, n, p;
; int * array;
;};
;extern struct node_t getnode();
;int test()
;{
; int sum = 0;
; struct node_t n = getnode();
; if(n.array != 0 && n.p > 0 && n.k > 0 && n.n > 0 && n.m > 0) {
; sum = ((int*)((int)n.array + n.p) )[ n.k + n.m + n.n ];
; }
; return sum;
;}
%struct.node_t = type { i32, i32, i32, i32, i32* }
; Fills a stack %struct.node_t through @getnode, then returns one i32 loaded
; from an address computed from its fields, or 0 if the pointer field
; (index 4) is null or any of the four integer fields is not greater than 0.
define i32 @test() {
entry:
%n = alloca %struct.node_t, align 4
; @getnode writes its result into %n through the sret pointer argument.
call void bitcast (void (%struct.node_t*, ...)* @getnode to void (%struct.node_t*)*)(%struct.node_t* sret %n)
; Field 4 (the i32* member): bail out to %if.end when it is null.
%array = getelementptr inbounds %struct.node_t* %n, i32 0, i32 4
%0 = load i32** %array, align 4
%cmp = icmp eq i32* %0, null
br i1 %cmp, label %if.end, label %land.lhs.true
land.lhs.true:
; Field 3 must be greater than 0.
%p = getelementptr inbounds %struct.node_t* %n, i32 0, i32 3
%1 = load i32* %p, align 4
%cmp1 = icmp sgt i32 %1, 0
br i1 %cmp1, label %land.lhs.true2, label %if.end
land.lhs.true2:
; Field 0 must be greater than 0.
%k = getelementptr inbounds %struct.node_t* %n, i32 0, i32 0
%2 = load i32* %k, align 4
%cmp3 = icmp sgt i32 %2, 0
br i1 %cmp3, label %land.lhs.true4, label %if.end
land.lhs.true4:
; Field 2 must be greater than 0.
%n5 = getelementptr inbounds %struct.node_t* %n, i32 0, i32 2
%3 = load i32* %n5, align 4
%cmp6 = icmp sgt i32 %3, 0
br i1 %cmp6, label %land.lhs.true7, label %if.end
land.lhs.true7:
; Field 1 must be greater than 0.
%m = getelementptr inbounds %struct.node_t* %n, i32 0, i32 1
%4 = load i32* %m, align 4
%cmp8 = icmp sgt i32 %4, 0
br i1 %cmp8, label %if.then, label %if.end
if.then:
; Element index = field 2 + field 0 + field 1.
%add = add i32 %3, %2
%add12 = add i32 %add, %4
; Base address = pointer field (as an integer) + field 3.
%5 = ptrtoint i32* %0 to i32
%add15 = add nsw i32 %1, %5
%6 = inttoptr i32 %add15 to i32*
; Load the i32 at base[index].
%arrayidx = getelementptr inbounds i32* %6, i32 %add12
%7 = load i32* %arrayidx, align 4
br label %if.end
if.end:
; The loaded value when every test passed, 0 on any early exit.
%sum.0 = phi i32 [ %7, %if.then ], [ 0, %land.lhs.true7 ], [ 0, %land.lhs.true4 ], [ 0, %land.lhs.true2 ], [ 0, %land.lhs.true ], [ 0, %entry ]
ret i32 %sum.0
}
declare void @getnode(%struct.node_t* sret, ...)

View File

@@ -0,0 +1,51 @@
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
; CHECK: addl ([[reg:%[a-z]+]])
; CHECK-NEXT: addl $4, [[reg]]
; Test for the FixupLEAs pre-emit pass.
; An LEA should NOT be substituted for the ADD instruction
; that increments the array pointer if it is greater than 5 instructions
; away from the memory reference that uses it.
; Original C code: clang -m32 -S -O2
;int test(int n, int * array, int * m, int * array2)
;{
; int i, j = 0;
; int sum = 0;
; for (i = 0, j = 0; i < n;) {
; ++i;
; *m += array2[j++];
; sum += array[i];
; }
; return sum;
;}
; Loop that, on each iteration, adds %array2[j] into the value stored at %m
; and adds %array[j + 1] into a running sum, for j counting up from 0 until
; j + 1 equals %n. Returns the running sum, or 0 when %n is not greater
; than 0.
define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
entry:
; Skip the loop entirely unless %n > 0.
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
; Read the initial value of *%m once, before entering the loop.
%.pre = load i32* %m, align 4
br label %for.body
for.body: ; preds = %for.body, %for.body.lr.ph
; %0 tracks the current value of *%m, %sum.010 the running sum, and
; %j.09 the loop counter.
%0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %add, %for.body ]
%sum.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add3, %for.body ]
%j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
%inc1 = add nsw i32 %j.09, 1
; *%m += %array2[%j.09]
%arrayidx = getelementptr inbounds i32* %array2, i32 %j.09
%1 = load i32* %arrayidx, align 4
%add = add nsw i32 %0, %1
store i32 %add, i32* %m, align 4
; sum += %array[%j.09 + 1]
%arrayidx2 = getelementptr inbounds i32* %array, i32 %inc1
%2 = load i32* %arrayidx2, align 4
%add3 = add nsw i32 %2, %sum.010
; Loop back until the incremented counter equals %n.
%exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
; Result is 0 when the loop was skipped, otherwise the final sum.
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ]
ret i32 %sum.0.lcssa
}

View File

@@ -17,7 +17,7 @@
; ATOM-NEXT: movsd A(,%rax,8)
; ATOM-NEXT: mulsd
; ATOM-NEXT: movsd
; ATOM-NEXT: incq %rax
; ATOM-NEXT: leaq 1(%rax), %rax
@A = external global [0 x double]