[X86] Memory folding for commutative instructions (updated)
This patch improves support for commutative instructions in the x86 memory folding implementation by attempting to fold a commuted version of the instruction if the original folding fails; if that fails as well, the instruction is 're-commuted' back to its original order before returning.

Updated version of r219584 (reverted in r219595): the commutation attempt now explicitly ensures that neither of the commuted source operands is tied to the destination operand/register, which was the source of all the regressions in the original patch attempt.

Added an additional regression test case provided by Joerg Sonnenberger.

Differential Revision: http://reviews.llvm.org/D5818

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@220239 91177308-0d34-0410-b5e6-96231b3b80d8
Parent: b9c7f60191
Commit: 0d1978b813
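In outline, the control flow the patch adds looks like the following minimal toy sketch. This is not the LLVM API: Instr, TryFoldFn, and foldWithCommute are stand-ins invented for illustration of MachineInstr, foldMemoryOperandImpl, and commuteInstruction.

#include <functional>
#include <utility>

// Toy model of the commute-and-retry strategy this patch adds.
struct Instr {
  int Src1, Src2;          // the two commutable source operands
  bool Src1Tied, Src2Tied; // is the operand tied to the destination?
};

using TryFoldFn = std::function<bool(Instr &, int /*OpIdx*/)>;

bool foldWithCommute(Instr &MI, int OpIdx, const TryFoldFn &TryFold) {
  if (TryFold(MI, OpIdx))          // first try the fold as-is
    return true;
  // The fix over the reverted r219584: refuse to commute when either
  // commutable source operand is tied to the destination register.
  if (MI.Src1Tied || MI.Src2Tied)
    return false;
  std::swap(MI.Src1, MI.Src2);     // commute the two sources
  int OtherIdx = (OpIdx == 1) ? 2 : 1;
  if (TryFold(MI, OtherIdx))       // retry against the other operand slot
    return true;
  std::swap(MI.Src1, MI.Src2);     // 're-commute' back to the original order
  return false;
}

The tied-operand guard matters because, for two-address instructions such as ADD32rr, one source is tied to the destination register; commuting a tied operand and then folding it into a memory operand would break the two-address constraint, which is what caused the regressions in the first attempt.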
lib/Target/X86/X86FastISel.cpp

@@ -3337,7 +3337,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
   AM.getFullAddress(AddrOps);
 
   MachineInstr *Result =
-    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
+    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps,
+                              Size, Alignment, /*AllowCommute=*/true);
   if (!Result)
     return false;
lib/Target/X86/X86InstrInfo.cpp

@@ -3907,10 +3907,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
 /// operand at the use. We fold the load instructions if load defines a virtual
 /// register, the virtual register is used once in the same BB, and the
 /// instructions in-between do not load or store, and have no side effects.
-MachineInstr* X86InstrInfo::
-optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
-                  unsigned &FoldAsLoadDefReg,
-                  MachineInstr *&DefMI) const {
+MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
+                                              const MachineRegisterInfo *MRI,
+                                              unsigned &FoldAsLoadDefReg,
+                                              MachineInstr *&DefMI) const {
   if (FoldAsLoadDefReg == 0)
     return nullptr;
   // To be conservative, if there exists another load, clear the load candidate.
@@ -3926,55 +3926,35 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
   if (!DefMI->isSafeToMove(this, nullptr, SawStore))
     return nullptr;
 
-  // We try to commute MI if possible.
-  unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
-  for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
-    // Collect information about virtual register operands of MI.
-    unsigned SrcOperandId = 0;
-    bool FoundSrcOperand = false;
-    for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = MI->getOperand(i);
-      if (!MO.isReg())
-        continue;
-      unsigned Reg = MO.getReg();
-      if (Reg != FoldAsLoadDefReg)
-        continue;
-      // Do not fold if we have a subreg use or a def or multiple uses.
-      if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
-        return nullptr;
-
-      SrcOperandId = i;
-      FoundSrcOperand = true;
-    }
-    if (!FoundSrcOperand) return nullptr;
-
-    // Check whether we can fold the def into SrcOperandId.
-    SmallVector<unsigned, 8> Ops;
-    Ops.push_back(SrcOperandId);
-    MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
-    if (FoldMI) {
-      FoldAsLoadDefReg = 0;
-      return FoldMI;
-    }
-
-    if (Idx == 1) {
-      // MI was changed but it didn't help, commute it back!
-      commuteInstruction(MI, false);
-      return nullptr;
-    }
-
-    // Check whether we can commute MI and enable folding.
-    if (MI->isCommutable()) {
-      MachineInstr *NewMI = commuteInstruction(MI, false);
-      // Unable to commute.
-      if (!NewMI) return nullptr;
-      if (NewMI != MI) {
-        // New instruction. It doesn't need to be kept.
-        NewMI->eraseFromParent();
-        return nullptr;
-      }
-    }
-  }
+  // Collect information about virtual register operands of MI.
+  unsigned SrcOperandId = 0;
+  bool FoundSrcOperand = false;
+  for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg != FoldAsLoadDefReg)
+      continue;
+    // Do not fold if we have a subreg use or a def or multiple uses.
+    if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+      return nullptr;
+
+    SrcOperandId = i;
+    FoundSrcOperand = true;
+  }
+  if (!FoundSrcOperand)
+    return nullptr;
+
+  // Check whether we can fold the def into SrcOperandId.
+  SmallVector<unsigned, 8> Ops;
+  Ops.push_back(SrcOperandId);
+  MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+  if (FoldMI) {
+    FoldAsLoadDefReg = 0;
+    return FoldMI;
+  }
+
   return nullptr;
 }
@@ -4134,7 +4114,8 @@ MachineInstr*
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                     MachineInstr *MI, unsigned i,
                                     const SmallVectorImpl<MachineOperand> &MOs,
-                                    unsigned Size, unsigned Align) const {
+                                    unsigned Size, unsigned Align,
+                                    bool AllowCommute) const {
   const DenseMap<unsigned,
                  std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   bool isCallRegIndirect = Subtarget.callRegIndirect();
@@ -4202,8 +4183,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
       return nullptr;
     // If this is a 64-bit load, but the spill slot is 32, then we can do
-    // a 32-bit load which is implicitly zero-extended. This likely is due
-    // to liveintervalanalysis remat'ing a load from stack slot.
+    // a 32-bit load which is implicitly zero-extended. This likely is
+    // due to live interval analysis remat'ing a load from stack slot.
     if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
       return nullptr;
     Opcode = X86::MOV32rm;
@@ -4222,8 +4203,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       // to a 32-bit one.
       unsigned DstReg = NewMI->getOperand(0).getReg();
       if (TargetRegisterInfo::isPhysicalRegister(DstReg))
-        NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
-                                                 X86::sub_32bit));
+        NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
       else
         NewMI->getOperand(0).setSubReg(X86::sub_32bit);
     }
@@ -4231,6 +4211,65 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     }
   }
 
+  // If the instruction and target operand are commutable, commute the
+  // instruction and try again.
+  if (AllowCommute) {
+    unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
+    if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+      bool HasDef = MI->getDesc().getNumDefs();
+      unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
+      unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
+      unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
+      bool Tied0 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+      bool Tied1 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+      // If either of the commutable operands are tied to the destination
+      // then we can not commute + fold.
+      if ((HasDef && Reg0 == Reg1 && Tied0) ||
+          (HasDef && Reg0 == Reg2 && Tied1))
+        return nullptr;
+
+      if ((CommuteOpIdx1 == OriginalOpIdx) ||
+          (CommuteOpIdx2 == OriginalOpIdx)) {
+        MachineInstr *CommutedMI = commuteInstruction(MI, false);
+        if (!CommutedMI) {
+          // Unable to commute.
+          return nullptr;
+        }
+        if (CommutedMI != MI) {
+          // New instruction. We can't fold from this.
+          CommutedMI->eraseFromParent();
+          return nullptr;
+        }
+
+        // Attempt to fold with the commuted version of the instruction.
+        unsigned CommuteOp =
+            (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
+        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
+                                      /*AllowCommute=*/false);
+        if (NewMI)
+          return NewMI;
+
+        // Folding failed again - undo the commute before returning.
+        MachineInstr *UncommutedMI = commuteInstruction(MI, false);
+        if (!UncommutedMI) {
+          // Unable to commute.
+          return nullptr;
+        }
+        if (UncommutedMI != MI) {
+          // New instruction. It doesn't need to be kept.
+          UncommutedMI->eraseFromParent();
+          return nullptr;
+        }
+
+        // Return here to prevent duplicate fuse failure report.
+        return nullptr;
+      }
+    }
+  }
+
   // No fusion
   if (PrintFailedFusing && !MI->isCopy())
     dbgs() << "We failed to fuse operand " << i << " in " << *MI;
@@ -4440,7 +4479,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
 
   SmallVector<MachineOperand,4> MOs;
   MOs.push_back(MachineOperand::CreateFI(FrameIndex));
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+                               Size, Alignment, /*AllowCommute=*/true);
 }
 
 static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
@@ -4593,7 +4633,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       break;
     }
   }
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
 }
 
lib/Target/X86/X86InstrInfo.h

@@ -404,7 +404,8 @@ public:
                                       MachineInstr* MI,
                                       unsigned OpNum,
                                       const SmallVectorImpl<MachineOperand> &MOs,
-                                      unsigned Size, unsigned Alignment) const;
+                                      unsigned Size, unsigned Alignment,
+                                      bool AllowCommute) const;
 
   void
   getUnconditionalBranch(MCInst &Branch,
test/CodeGen/X86/avx1-stack-reload-folding.ll (new file, 16 lines)

@@ -0,0 +1,16 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Function Attrs: nounwind readonly uwtable
+define <32 x double> @_Z14vstack_foldDv32_dS_(<32 x double> %a, <32 x double> %b) #0 {
+  %1 = fadd <32 x double> %a, %b
+  %2 = fsub <32 x double> %a, %b
+  %3 = fmul <32 x double> %1, %2
+  ret <32 x double> %3
+
+  ;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload
+  ;CHECK: vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  ;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload
+}
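For context, C++ along the following lines would lower to IR like the test above when compiled with Clang's vector extensions. This is an assumption for illustration, not part of the commit; the function name vstack_fold is hypothetical, suggested by the test's mangled symbol.

// Hypothetical source for the test IR above. Two <32 x double> values span
// 8 YMM registers each under AVX, so keeping the fadd and fsub results live
// at the fmul forces a spill; the patch lets vmulpd fold the reload from
// the stack slot instead of reloading into a register first.
typedef double double32 __attribute__((ext_vector_type(32)));

double32 vstack_fold(double32 a, double32 b) {
  return (a + b) * (a - b);
}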
test/CodeGen/X86/fold-tied-op.ll (new file, 84 lines)

@@ -0,0 +1,84 @@
+; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
+; Regression test for http://reviews.llvm.org/D5701
+
+; ModuleID = 'xxhash.i'
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+; CHECK-LABEL: fn1
+; CHECK: shldl {{.*#+}} 4-byte Folded Spill
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: shldl {{.*#+}} 4-byte Folded Spill
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
+; CHECK: imull {{.*#+}} 4-byte Folded Reload
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: retl
+
+%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
+
+@a = common global i32 0, align 4
+@b = common global i64 0, align 8
+
+; Function Attrs: nounwind uwtable
+define i64 @fn1() #0 {
+entry:
+  %0 = load i32* @a, align 4, !tbaa !1
+  %1 = inttoptr i32 %0 to %struct.XXH_state64_t*
+  %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
+  %2 = load i32* %total_len, align 4, !tbaa !5
+  %tobool = icmp eq i32 %2, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
+  %3 = load i64* %v3, align 4, !tbaa !8
+  %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
+  %4 = load i64* %v4, align 4, !tbaa !9
+  %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
+  %5 = load i64* %v2, align 4, !tbaa !10
+  %shl = shl i64 %5, 1
+  %or = or i64 %shl, %5
+  %shl2 = shl i64 %3, 2
+  %shr = lshr i64 %3, 1
+  %or3 = or i64 %shl2, %shr
+  %add = add i64 %or, %or3
+  %mul = mul i64 %4, -4417276706812531889
+  %shl4 = mul i64 %4, -8834553413625063778
+  %shr5 = ashr i64 %mul, 3
+  %or6 = or i64 %shr5, %shl4
+  %mul7 = mul nsw i64 %or6, 1400714785074694791
+  %xor = xor i64 %add, %mul7
+  store i64 %xor, i64* @b, align 8, !tbaa !11
+  %mul8 = mul nsw i64 %xor, 1400714785074694791
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %6 = load i64* @b, align 8, !tbaa !11
+  %xor10 = xor i64 %6, -4417276706812531889
+  %mul11 = mul nsw i64 %xor10, 400714785074694791
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
+  %storemerge = add i64 %storemerge.in, -8796714831421723037
+  store i64 %storemerge, i64* @b, align 8, !tbaa !11
+  ret i64 undef
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !2, i64 0}
+!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24}
+!7 = metadata !{metadata !"long long", metadata !3, i64 0}
+!8 = metadata !{metadata !6, metadata !7, i64 16}
+!9 = metadata !{metadata !6, metadata !7, i64 24}
+!10 = metadata !{metadata !6, metadata !7, i64 8}
+!11 = metadata !{metadata !7, metadata !7, i64 0}