mirror of https://github.com/c64scene-ar/llvm-6502.git (synced 2024-12-13 04:30:23 +00:00)
On Sandybridge split unaligned 256bit stores into two xmm-sized stores.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172894 91177308-0d34-0410-b5e6-96231b3b80d8
parent cfcab21e4d
commit ba95865441
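In short: PerformSTORECombine previously split a 256-bit store only when the stored value was a two-operand CONCAT_VECTORS node; it now splits every unaligned 256-bit store on AVX-only (Sandy Bridge class) targets, using Extract128BitVector to produce the two halves. A minimal sketch of the intended effect, modeled on the storev32i8_01 test updated below; the registers shown are illustrative, not taken from the patch:

    ; store <32 x i8> %a, <32 x i8>* %p, align 4
    ;
    ; before: one unaligned 256-bit store
    ;   vmovups      %ymm0, (%rdi)
    ; after: two 128-bit stores
    ;   vmovups      %xmm0, (%rdi)
    ;   vextractf128 $1, %ymm0, %xmm1
    ;   vmovups      %xmm1, 16(%rdi)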
@@ -16344,12 +16344,15 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
 
   ISD::LoadExtType Ext = Ld->getExtensionType();
   unsigned Alignment = Ld->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8;
 
   // On Sandybridge unaligned 256bit loads are inefficient.
   if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
-      !DCI.isBeforeLegalizeOps() && Alignment < 32 &&
-      Ext == ISD::NON_EXTLOAD) {
+      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
     unsigned NumElems = RegVT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
     SDValue Ptr = Ld->getBasePtr();
     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
@@ -16363,7 +16366,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                 Ld->getPointerInfo(), Ld->isVolatile(),
                                 Ld->isNonTemporal(), Ld->isInvariant(),
-                                Alignment);
+                                std::max(Alignment/2U, 1U));
     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                              Load1.getValue(1),
                              Load2.getValue(1));
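The two hunks above also retune the load side of the combine (these routines live in lib/Target/X86/X86ISelLowering.cpp): the existing 256-bit load split now keys off the same IsAligned predicate, and the second half-load is annotated with max(Alignment/2, 1) instead of the original alignment. One behavioral change worth noting: Alignment == 0 (no explicit align, i.e. ABI alignment) previously satisfied Alignment < 32 and forced a split; it now counts as aligned and the load is left whole. Roughly, per the wideloads test updated later in this patch, on an AVX-only target:

    %v0 = load <8 x float>* %a, align 16   ; unaligned: split into vmovaps %xmm + vinsertf128
    %v1 = load <8 x float>* %b, align 32   ; aligned:   kept as a single 256-bit load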
@@ -16536,16 +16539,21 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   DebugLoc dl = St->getDebugLoc();
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned Alignment = St->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8;
 
   // If we are saving a concatenation of two XMM registers, perform two stores.
   // On Sandy Bridge, 256-bit memory operations are executed by two
   // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
   // memory operation.
   if (VT.is256BitVector() && !Subtarget->hasInt256() &&
-      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
-      StoredVal.getNumOperands() == 2) {
-    SDValue Value0 = StoredVal.getOperand(0);
-    SDValue Value1 = StoredVal.getOperand(1);
+      StVT == VT && !IsAligned) {
+    unsigned NumElems = VT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
 
     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
     SDValue Ptr0 = St->getBasePtr();
@@ -16553,10 +16561,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
 
     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
                                St->getPointerInfo(), St->isVolatile(),
-                               St->isNonTemporal(), St->getAlignment());
+                               St->isNonTemporal(), Alignment);
     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                                St->getPointerInfo(), St->isVolatile(),
-                               St->isNonTemporal(), St->getAlignment());
+                               St->isNonTemporal(),
+                               std::max(Alignment/2U, 1U));
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   }
 
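Note the alignment bookkeeping in the store hunks above: the half at the original address keeps Alignment, while the half at offset 16 is conservatively annotated with max(Alignment/2, 1) (the clamp avoids 0, which would mean "use the ABI alignment"). A worked example:

    ; store <8 x float> %x, <8 x float>* %p, align 4
    ;   low half:  16-byte store at %p,      align 4
    ;   high half: 16-byte store at %p + 16, align max(4/2, 1) = 2

The remaining hunks update FileCheck patterns and alignments in the regression tests to match the new lowering.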
@@ -2,7 +2,7 @@
 
 ;CHECK: add18i16
 define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind {
-;CHECK: vmovups
+;CHECK: vmovaps
   %b = load <18 x i16>* %bp, align 16
   %x = add <18 x i16> zeroinitializer, %b
   store <18 x i16> %x, <18 x i16>* %ret, align 16
@@ -42,7 +42,7 @@ define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwt
 
 ; Move the constants using a single vector store.
 ; CHECK: merge_const_store_vec
-; CHECK: vmovups %ymm0, (%rsi)
+; CHECK: vmovups
 ; CHECK: ret
 define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
   %1 = icmp sgt i32 %count, 0
@@ -53,19 +53,24 @@ define void @storev16i16(<16 x i16> %a) nounwind {
   unreachable
 }
 
-; CHECK: vmovups %ymm
+; CHECK: storev16i16_01
+; CHECK: vextractf128
+; CHECK: vmovaps %xmm
 define void @storev16i16_01(<16 x i16> %a) nounwind {
   store <16 x i16> %a, <16 x i16>* undef, align 4
   unreachable
 }
 
+; CHECK: storev32i8
 ; CHECK: vmovaps %ymm
 define void @storev32i8(<32 x i8> %a) nounwind {
   store <32 x i8> %a, <32 x i8>* undef, align 32
   unreachable
 }
 
-; CHECK: vmovups %ymm
+; CHECK: storev32i8_01
+; CHECK: vextractf128
+; CHECK: vmovups %xmm
 define void @storev32i8_01(<32 x i8> %a) nounwind {
   store <32 x i8> %a, <32 x i8>* undef, align 4
   unreachable
@@ -76,7 +81,7 @@ define void @storev32i8_01(<32 x i8> %a) nounwind {
 ; CHECK: _double_save
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovaps %xmm
+; CHECK: vmovups %xmm
 ; CHECK: vmovaps %xmm
 define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
 entry:
@@ -186,18 +186,6 @@ define void @sext_4(<4 x i16>* %inbuf, <4 x i64>* %outbuf) {
   ret void
 }
 
-; AVX: sext_5
-; AVX: vpmovsxbw
-; AVX: vpmovsxwd
-; AVX: vpmovsxwd
-; AVX: vpmovsxdq
-; AVX: ret
-define void @sext_5(<8 x i8>* %inbuf, <8 x i64>* %outbuf) {
-  %v0 = load <8 x i8>* %inbuf
-  %r = sext <8 x i8> %v0 to <8 x i64>
-  store <8 x i64> %r, <8 x i64>* %outbuf
-  ret void
-}
 ; AVX: sext_6
 ; AVX: vpmovsxbw
 ; AVX: vpmovsxwd
@@ -49,8 +49,8 @@ define <8 x float> @test4(<8 x double>* %p) nounwind {
 ; CHECK: movlhps
 ; CHECK: ret
 ; AVX: test4
-; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
-; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vcvtpd2psy
+; AVX: vcvtpd2psy
 ; AVX: vinsertf128
 ; AVX: ret
   %x = load <8 x double>* %p
@@ -3,7 +3,7 @@
 ;CHECK: wideloads
 ;CHECK: vmovaps
 ;CHECK: vinsertf128
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK-NOT: vinsertf128
 ;CHECK: ret
 
@@ -11,11 +11,29 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
   %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
   %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
   %m0 = fcmp olt <8 x float> %v1, %v0
-  %v2 = load <8 x float>* %c, align 16
+  %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
   %m1 = fcmp olt <8 x float> %v2, %v0
   %mand = and <8 x i1> %m1, %m0
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
   ret void
 }
 
+; CHECK: widestores
+; loads:
+; CHECK: vmovaps
+; CHECK: vmovaps
+; stores:
+; CHECK: vmovaps
+; CHECK: vextractf128
+; CHECK: vmovaps
+;CHECK: ret
+
+define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+  %v0 = load <8 x float>* %a, align 32
+  %v1 = load <8 x float>* %b, align 32
+  store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned
+  store <8 x float> %v1, <8 x float>* %a, align 16 ; <--- unaligned
+  ret void
+}
+
@@ -6,7 +6,7 @@
 ;CHECK: vcmpltp
 ;CHECK: vandps
 ;CHECK: vandps
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: ret
 
 define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
@@ -17,7 +17,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
   %m1 = fcmp olt <8 x float> %v2, %v0
   %mand = and <8 x i1> %m1, %m0
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
   ret void
 }
 
@@ -25,7 +25,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ;CHECK: vcmpltps
 ;CHECK: vxorps
 ;CHECK: vandps
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: ret
 define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
   %v0 = load <8 x float>* %a, align 16
@@ -33,7 +33,7 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
   %m0 = fcmp olt <8 x float> %v1, %v0
   %mand = xor <8 x i1> %m0, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
   ret void
 }
 
@@ -29,8 +29,8 @@ entry:
 ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
 ; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
   %0 = load <8 x float>* %in
   %1 = fpext <8 x float> %0 to <8 x double>
   store <8 x double> %1, <8 x double>* %out, align 1