On Sandybridge, loading an unaligned 256-bit value using two XMM loads (vmovups and vinsertf128) is faster than using a single 256-bit vmovups instruction.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172868 91177308-0d34-0410-b5e6-96231b3b80d8
commit 48177ac90f
parent 7336f7febb
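For illustration only (a sketch, not code from this commit): the split-load pattern the combine emits corresponds roughly to the following AVX intrinsics, where load256_split is a hypothetical helper name.

#include <immintrin.h>

// Hypothetical sketch of the Sandybridge-friendly pattern: two 16-byte
// XMM loads merged with vinsertf128, instead of one 32-byte vmovups.
static inline __m256 load256_split(const float *p) {
  __m128 lo = _mm_loadu_ps(p);      // low 128 bits  (vmovups xmm)
  __m128 hi = _mm_loadu_ps(p + 4);  // high 128 bits (vmovups xmm)
  return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); // vinsertf128
}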
lib/Target/X86/X86ISelLowering.cpp

@@ -16340,8 +16340,39 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   EVT MemVT = Ld->getMemoryVT();
   DebugLoc dl = Ld->getDebugLoc();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
 
   ISD::LoadExtType Ext = Ld->getExtensionType();
+  unsigned Alignment = Ld->getAlignment();
+
+  // On Sandybridge unaligned 256bit loads are inefficient.
+  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+      !DCI.isBeforeLegalizeOps() && Alignment < 32 &&
+      Ext == ISD::NON_EXTLOAD) {
+    unsigned NumElems = RegVT.getVectorNumElements();
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                  NumElems/2);
+    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                Alignment);
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             Load1.getValue(1),
+                             Load2.getValue(1));
+
+    SDValue NewVec = DAG.getUNDEF(RegVT);
+    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+    return DCI.CombineTo(N, NewVec, TF, true);
+  }
 
   // If this is a vector EXT Load then attempt to optimize it using a
   // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
@@ -16356,7 +16387,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   assert(MemVT.isVector() && "Must load a vector from memory");
 
   unsigned NumElems = RegVT.getVectorNumElements();
-  unsigned RegSz = RegVT.getSizeInBits();
   unsigned MemSz = MemVT.getSizeInBits();
   assert(RegSz > MemSz && "Register size must be greater than the mem size");
 
test/CodeGen/X86/sandybridge-loads.ll (new file, 21 lines)

@@ -0,0 +1,21 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
+
+;CHECK: wideloads
+;CHECK: vmovaps
+;CHECK: vinsertf128
+;CHECK: vmovups
+;CHECK-NOT: vinsertf128
+;CHECK: ret
+
+define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+  %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
+  %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
+  %m0 = fcmp olt <8 x float> %v1, %v0
+  %v2 = load <8 x float>* %c, align 16
+  %m1 = fcmp olt <8 x float> %v2, %v0
+  %mand = and <8 x i1> %m1, %m0
+  %r = zext <8 x i1> %mand to <8 x i32>
+  store <8 x i32> %r, <8 x i32>* undef, align 16
+  ret void
+}
+
test/CodeGen/X86/v8i1-masks.ll

@@ -1,7 +1,7 @@
 ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
 
 ;CHECK: and_masks
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: vcmpltp
 ;CHECK: vcmpltp
 ;CHECK: vandps
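Why the first CHECK changes from vmovups to vmovaps (illustrative reasoning, assuming the and_masks loads are 16-byte aligned like those in the new test above): after the split, each 128-bit half of an align-16 load is itself 16-byte aligned, so the first half can use an aligned vmovaps and the second half can be folded into the vinsertf128. A hypothetical intrinsics equivalent:

#include <immintrin.h>

// Hypothetical sketch: with a 16-byte-aligned pointer, both 128-bit halves
// are aligned; the compiler may fold the second load into vinsertf128.
static inline __m256 load256_split_aligned16(const float *p) {
  __m128 lo = _mm_load_ps(p);      // vmovaps: p is 16-byte aligned
  __m128 hi = _mm_load_ps(p + 4);  // p + 16 bytes is also 16-byte aligned
  return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
}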