mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-05 12:31:33 +00:00
shld is a very high latency operation. Instead of emitting it for shifts of
two or three, open code the equivalent operation, which is faster on Athlon and P4 (by a substantial margin).
For example, instead of compiling this:
  long long X2(long long Y) { return Y << 2; }
to:
  X2:
    movl 4(%esp), %eax
    movl 8(%esp), %edx
    shldl $2, %eax, %edx
    shll $2, %eax
    ret
Compile it to:
  X2:
    movl 4(%esp), %eax
    movl 8(%esp), %ecx
    movl %eax, %edx
    shrl $30, %edx
    leal (%edx,%ecx,4), %edx
    shll $2, %eax
    ret
Likewise, for << 3, compile to:
  X3:
    movl 4(%esp), %eax
    movl 8(%esp), %ecx
    movl %eax, %edx
    shrl $29, %edx
    leal (%edx,%ecx,8), %edx
    shll $3, %eax
    ret
This matches icc, except that icc open codes the shifts as adds on the P4.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@17707 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
62f5a9402c
commit
ce7cafa960
@ -314,6 +314,13 @@ namespace {
|
|||||||
Value *Op, Value *ShiftAmount, bool isLeftShift,
|
Value *Op, Value *ShiftAmount, bool isLeftShift,
|
||||||
const Type *ResultTy, unsigned DestReg);
|
const Type *ResultTy, unsigned DestReg);
|
||||||
|
|
||||||
|
// Emit code for a 'SHLD DestReg, Op0, Op1, Amt' operation, where Amt is a
|
||||||
|
// constant.
// MBB/MBBI give the block and insertion point for the emitted instructions.
// Op1Val is the constant shift amount; the out-of-line definition names the
// same two parameters IP (for MBBI) and Amt (for Op1Val).
|
||||||
|
void doSHLDConst(MachineBasicBlock *MBB,
|
||||||
|
MachineBasicBlock::iterator MBBI,
|
||||||
|
unsigned DestReg, unsigned Op0Reg, unsigned Op1Reg,
|
||||||
|
unsigned Op1Val);
|
||||||
|
|
||||||
/// emitSelectOperation - Common code shared between visitSelectInst and the
|
/// emitSelectOperation - Common code shared between visitSelectInst and the
|
||||||
/// constant expression support.
|
/// constant expression support.
|
||||||
void emitSelectOperation(MachineBasicBlock *MBB,
|
void emitSelectOperation(MachineBasicBlock *MBB,
|
||||||
@ -2893,6 +2900,41 @@ void X86ISel::visitShiftInst(ShiftInst &I) {
|
|||||||
getReg (I));
|
getReg (I));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Emit code for a 'SHLD DestReg, Op0, Op1, Amt' operation, where Amt is a
|
||||||
|
/// constant.
///
/// Instructions are inserted into MBB before IP. Depending on Amt this emits
/// either a cheaper open-coded sequence (Amt of 0, 1, 2 or 3) or a real
/// SHLD32rri8 (every other value). The open-coded Amt == 2|3 sequence computes
/// DestReg = (Op1Reg >> (32-Amt)) + Op0Reg * (1 << Amt).
|
||||||
|
void X86ISel::doSHLDConst(MachineBasicBlock *MBB,
|
||||||
|
MachineBasicBlock::iterator IP,
|
||||||
|
unsigned DestReg, unsigned Op0Reg, unsigned Op1Reg,
|
||||||
|
unsigned Amt) {
|
||||||
|
// SHLD is a very inefficient operation on every processor, so try to do
|
||||||
|
// something simpler for common values of 'Amt'.
|
||||||
|
if (Amt == 0) {
|
||||||
|
// Shift by zero: the result is just a register copy of Op0Reg.
BuildMI(*MBB, IP, X86::MOV32rr, 1, DestReg).addReg(Op0Reg);
|
||||||
|
} else if (Amt == 1) {
|
||||||
|
// Tmp receives Op1Reg+Op1Reg but is never read again in this function.
// NOTE(review): presumably the ADD is emitted only for its carry-out (the
// top bit of Op1Reg), which the following ADC adds into Op0Reg+Op0Reg --
// confirm nothing that clobbers the flags can be placed between the two.
unsigned Tmp = makeAnotherReg(Type::UIntTy);
|
||||||
|
BuildMI(*MBB, IP, X86::ADD32rr, 2, Tmp).addReg(Op1Reg).addReg(Op1Reg);
|
||||||
|
BuildMI(*MBB, IP, X86::ADC32rr, 2, DestReg).addReg(Op0Reg).addReg(Op0Reg);
|
||||||
|
} else if (Amt == 2 || Amt == 3) {
|
||||||
|
// On the P4 and Athlon it is cheaper to replace shld ..., 2|3 with a
|
||||||
|
// shift/lea pair. NOTE: This should not be done on the P6 family!
// (No processor check is made here; the replacement is unconditional.)
|
||||||
|
unsigned Tmp = makeAnotherReg(Type::UIntTy);
|
||||||
|
// Tmp = Op1Reg >> (32-Amt): the bits of Op1Reg that move into the result.
BuildMI(*MBB, IP, X86::SHR32ri, 2, Tmp).addReg(Op1Reg).addImm(32-Amt);
|
||||||
|
// Describe the address Tmp + Op0Reg*Scale + 0, with Scale = 4 or 8, so a
// single LEA performs both the left shift of Op0Reg and the combine.
X86AddressMode AM;
|
||||||
|
AM.BaseType = X86AddressMode::RegBase;
|
||||||
|
AM.Base.Reg = Tmp;
|
||||||
|
AM.Scale = 1 << Amt;
|
||||||
|
AM.IndexReg = Op0Reg;
|
||||||
|
AM.Disp = 0;
|
||||||
|
addFullAddress(BuildMI(*MBB, IP, X86::LEA32r, 4, DestReg), AM);
|
||||||
|
} else {
|
||||||
|
// NOTE: It is always cheaper on the P4 to emit SHLD as two shifts and an OR
|
||||||
|
// than it is to emit a real SHLD.
// That alternative is not implemented here: a real SHLD is still emitted.
|
||||||
|
|
||||||
|
BuildMI(*MBB, IP, X86::SHLD32rri8, 3,
|
||||||
|
DestReg).addReg(Op0Reg).addReg(Op1Reg).addImm(Amt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// emitShiftOperation - Common code shared between visitShiftInst and
|
/// emitShiftOperation - Common code shared between visitShiftInst and
|
||||||
/// constant expression support.
|
/// constant expression support.
|
||||||
void X86ISel::emitShiftOperation(MachineBasicBlock *MBB,
|
void X86ISel::emitShiftOperation(MachineBasicBlock *MBB,
|
||||||
@ -2904,25 +2946,22 @@ void X86ISel::emitShiftOperation(MachineBasicBlock *MBB,
|
|||||||
bool isSigned = ResultTy->isSigned ();
|
bool isSigned = ResultTy->isSigned ();
|
||||||
unsigned Class = getClass (ResultTy);
|
unsigned Class = getClass (ResultTy);
|
||||||
|
|
||||||
static const unsigned ConstantOperand[][4] = {
|
static const unsigned ConstantOperand[][3] = {
|
||||||
{ X86::SHR8ri, X86::SHR16ri, X86::SHR32ri, X86::SHRD32rri8 }, // SHR
|
{ X86::SHR8ri, X86::SHR16ri, X86::SHR32ri }, // SHR
|
||||||
{ X86::SAR8ri, X86::SAR16ri, X86::SAR32ri, X86::SHRD32rri8 }, // SAR
|
{ X86::SAR8ri, X86::SAR16ri, X86::SAR32ri }, // SAR
|
||||||
{ X86::SHL8ri, X86::SHL16ri, X86::SHL32ri, X86::SHLD32rri8 }, // SHL
|
{ X86::SHL8ri, X86::SHL16ri, X86::SHL32ri }, // SHL
|
||||||
{ X86::SHL8ri, X86::SHL16ri, X86::SHL32ri, X86::SHLD32rri8 }, // SAL = SHL
|
{ X86::SHL8ri, X86::SHL16ri, X86::SHL32ri }, // SAL = SHL
|
||||||
};
|
};
|
||||||
|
|
||||||
static const unsigned NonConstantOperand[][4] = {
|
static const unsigned NonConstantOperand[][3] = {
|
||||||
{ X86::SHR8rCL, X86::SHR16rCL, X86::SHR32rCL }, // SHR
|
{ X86::SHR8rCL, X86::SHR16rCL, X86::SHR32rCL }, // SHR
|
||||||
{ X86::SAR8rCL, X86::SAR16rCL, X86::SAR32rCL }, // SAR
|
{ X86::SAR8rCL, X86::SAR16rCL, X86::SAR32rCL }, // SAR
|
||||||
{ X86::SHL8rCL, X86::SHL16rCL, X86::SHL32rCL }, // SHL
|
{ X86::SHL8rCL, X86::SHL16rCL, X86::SHL32rCL }, // SHL
|
||||||
{ X86::SHL8rCL, X86::SHL16rCL, X86::SHL32rCL }, // SAL = SHL
|
{ X86::SHL8rCL, X86::SHL16rCL, X86::SHL32rCL }, // SAL = SHL
|
||||||
};
|
};
|
||||||
|
|
||||||
// Longs, as usual, are handled specially...
|
// Longs, as usual, are handled specially.
|
||||||
if (Class == cLong) {
|
if (Class == cLong) {
|
||||||
// If we have a constant shift, we can generate much more efficient code
|
|
||||||
// than otherwise...
|
|
||||||
//
|
|
||||||
if (ConstantUInt *CUI = dyn_cast<ConstantUInt>(ShiftAmount)) {
|
if (ConstantUInt *CUI = dyn_cast<ConstantUInt>(ShiftAmount)) {
|
||||||
unsigned Amount = CUI->getValue();
|
unsigned Amount = CUI->getValue();
|
||||||
if (Amount == 1 && isLeftShift) { // X << 1 == X+X
|
if (Amount == 1 && isLeftShift) { // X << 1 == X+X
|
||||||
@ -2933,12 +2972,11 @@ void X86ISel::emitShiftOperation(MachineBasicBlock *MBB,
|
|||||||
} else if (Amount < 32) {
|
} else if (Amount < 32) {
|
||||||
const unsigned *Opc = ConstantOperand[isLeftShift*2+isSigned];
|
const unsigned *Opc = ConstantOperand[isLeftShift*2+isSigned];
|
||||||
if (isLeftShift) {
|
if (isLeftShift) {
|
||||||
BuildMI(*MBB, IP, Opc[3], 3,
|
doSHLDConst(MBB, IP, DestReg+1, SrcReg+1, SrcReg, Amount);
|
||||||
DestReg+1).addReg(SrcReg+1).addReg(SrcReg).addImm(Amount);
|
|
||||||
BuildMI(*MBB, IP, Opc[2], 2, DestReg).addReg(SrcReg).addImm(Amount);
|
BuildMI(*MBB, IP, Opc[2], 2, DestReg).addReg(SrcReg).addImm(Amount);
|
||||||
} else {
|
} else {
|
||||||
BuildMI(*MBB, IP, Opc[3], 3,
|
BuildMI(*MBB, IP, X86::SHRD32rri8, 3,
|
||||||
DestReg).addReg(SrcReg ).addReg(SrcReg+1).addImm(Amount);
|
DestReg).addReg(SrcReg ).addReg(SrcReg+1).addImm(Amount);
|
||||||
BuildMI(*MBB, IP, Opc[2],2,DestReg+1).addReg(SrcReg+1).addImm(Amount);
|
BuildMI(*MBB, IP, Opc[2],2,DestReg+1).addReg(SrcReg+1).addImm(Amount);
|
||||||
}
|
}
|
||||||
} else { // Shifting more than 32 bits
|
} else { // Shifting more than 32 bits
|
||||||
|
Loading…
Reference in New Issue
Block a user