- Two-address pass should not assume unfolding is always successful.

- X86 unfolding should check if the instructions being unfolded has memoperands.
  If there is no memoperands, then it must assume conservative alignment. If this
  would introduce an expensive sse unaligned load / store, then unfoldMemoryOperand
  etc. should not unfold the instruction.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@107509 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evan Cheng 2010-07-02 20:36:18 +00:00
parent 541481f34c
commit 98ec91ea80
3 changed files with 126 additions and 12 deletions

View File

@ -926,14 +926,12 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
UnfoldTID.OpInfo[LoadRegIndex].getRegClass(TRI); UnfoldTID.OpInfo[LoadRegIndex].getRegClass(TRI);
unsigned Reg = MRI->createVirtualRegister(RC); unsigned Reg = MRI->createVirtualRegister(RC);
SmallVector<MachineInstr *, 2> NewMIs; SmallVector<MachineInstr *, 2> NewMIs;
bool Success = if (!TII->unfoldMemoryOperand(MF, mi, Reg,
TII->unfoldMemoryOperand(MF, mi, Reg,
/*UnfoldLoad=*/true,/*UnfoldStore=*/false, /*UnfoldLoad=*/true,/*UnfoldStore=*/false,
NewMIs); NewMIs)) {
(void)Success; DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
assert(Success && return false;
"unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold " }
"succeeded!");
assert(NewMIs.size() == 2 && assert(NewMIs.size() == 2 &&
"Unfolded a load into multiple instructions!"); "Unfolded a load into multiple instructions!");
// The load was previously folded, so this is the only use. // The load was previously folded, so this is the only use.

View File

@ -2159,7 +2159,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd, MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const { SmallVectorImpl<MachineInstr*> &NewMIs) const {
bool isAligned = (*MMOBegin)->getAlignment() >= 16; bool isAligned = *MMOBegin && (*MMOBegin)->getAlignment() >= 16;
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
DebugLoc DL; DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
@ -2189,7 +2189,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd, MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const { SmallVectorImpl<MachineInstr*> &NewMIs) const {
bool isAligned = (*MMOBegin)->getAlignment() >= 16; bool isAligned = *MMOBegin && (*MMOBegin)->getAlignment() >= 16;
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
DebugLoc DL; DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
@ -2693,6 +2693,13 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
const TargetInstrDesc &TID = get(Opc); const TargetInstrDesc &TID = get(Opc);
const TargetOperandInfo &TOI = TID.OpInfo[Index]; const TargetOperandInfo &TOI = TID.OpInfo[Index];
const TargetRegisterClass *RC = TOI.getRegClass(&RI); const TargetRegisterClass *RC = TOI.getRegClass(&RI);
if (!MI->hasOneMemOperand() &&
RC == &X86::VR128RegClass &&
!TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
// Without memoperands, loadRegFromAddr and storeRegToStackSlot will
// conservatively assume the address is unaligned. That's bad for
// performance.
return false;
SmallVector<MachineOperand, X86AddrNumOperands> AddrOps; SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
SmallVector<MachineOperand,2> BeforeOps; SmallVector<MachineOperand,2> BeforeOps;
SmallVector<MachineOperand,2> AfterOps; SmallVector<MachineOperand,2> AfterOps;
@ -2834,7 +2841,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
MachineInstr::mmo_iterator> MMOs = MachineInstr::mmo_iterator> MMOs =
MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
cast<MachineSDNode>(N)->memoperands_end()); cast<MachineSDNode>(N)->memoperands_end());
bool isAligned = (*MMOs.first)->getAlignment() >= 16; if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
!TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
// Do not introduce a slow unaligned load.
return false;
bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl, Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
VT, MVT::Other, &AddrOps[0], AddrOps.size()); VT, MVT::Other, &AddrOps[0], AddrOps.size());
NewNodes.push_back(Load); NewNodes.push_back(Load);
@ -2871,7 +2883,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
MachineInstr::mmo_iterator> MMOs = MachineInstr::mmo_iterator> MMOs =
MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
cast<MachineSDNode>(N)->memoperands_end()); cast<MachineSDNode>(N)->memoperands_end());
bool isAligned = (*MMOs.first)->getAlignment() >= 16; if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
!TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
// Do not introduce a slow unaligned store.
return false;
bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
isAligned, TM), isAligned, TM),
dl, MVT::Other, dl, MVT::Other,

View File

@ -0,0 +1,99 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin
; rdar://8154265
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
define void @_ZN2CA3OGL20fill_surface_mesh_3dERNS0_7ContextEPKNS_6Render13MeshTransformEPKNS0_5LayerEPNS0_7SurfaceEfNS0_13TextureFilterESC_f() nounwind optsize ssp {
entry:
br i1 undef, label %bb2.thread, label %bb2
bb2.thread: ; preds = %entry
br i1 undef, label %bb41, label %bb10.preheader
bb2: ; preds = %entry
unreachable
bb10.preheader: ; preds = %bb2.thread
br i1 undef, label %bb9, label %bb12
bb9: ; preds = %bb9, %bb10.preheader
br i1 undef, label %bb9, label %bb12
bb12: ; preds = %bb9, %bb10.preheader
br i1 undef, label %bb4.i.i, label %bb3.i.i
bb3.i.i: ; preds = %bb12
unreachable
bb4.i.i: ; preds = %bb12
br i1 undef, label %bb8.i.i, label %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
bb8.i.i: ; preds = %bb4.i.i
br i1 undef, label %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit, label %bb9.i.i
bb9.i.i: ; preds = %bb8.i.i
br i1 undef, label %bb11.i.i, label %bb10.i.i
bb10.i.i: ; preds = %bb9.i.i
unreachable
bb11.i.i: ; preds = %bb9.i.i
unreachable
_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit: ; preds = %bb8.i.i, %bb4.i.i
br i1 undef, label %bb19, label %bb14
bb14: ; preds = %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
unreachable
bb19: ; preds = %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
br i1 undef, label %bb.i50, label %bb6.i
bb.i50: ; preds = %bb19
unreachable
bb6.i: ; preds = %bb19
br i1 undef, label %bb28, label %bb.nph106
bb22: ; preds = %bb24.preheader
br i1 undef, label %bb2.i.i, label %bb.i.i49
bb.i.i49: ; preds = %bb22
%0 = load float* undef, align 4 ; <float> [#uses=1]
%1 = insertelement <4 x float> undef, float %0, i32 0 ; <<4 x float>> [#uses=1]
%2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> %1) nounwind readnone ; <<4 x float>> [#uses=1]
%3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %2, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>) nounwind readnone ; <<4 x float>> [#uses=1]
%4 = extractelement <4 x float> %3, i32 0 ; <float> [#uses=1]
store float %4, float* undef, align 4
%5 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> undef) nounwind readnone ; <<4 x float>> [#uses=1]
%6 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %5, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>) nounwind readnone ; <<4 x float>> [#uses=1]
%7 = extractelement <4 x float> %6, i32 0 ; <float> [#uses=1]
store float %7, float* undef, align 4
unreachable
bb2.i.i: ; preds = %bb22
unreachable
bb26.loopexit: ; preds = %bb24.preheader
br i1 undef, label %bb28, label %bb24.preheader
bb.nph106: ; preds = %bb6.i
br label %bb24.preheader
bb24.preheader: ; preds = %bb.nph106, %bb26.loopexit
br i1 undef, label %bb22, label %bb26.loopexit
bb28: ; preds = %bb26.loopexit, %bb6.i
unreachable
bb41: ; preds = %bb2.thread
br i1 undef, label %return, label %bb46
bb46: ; preds = %bb41
ret void
return: ; preds = %bb41
ret void
}