mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-03-02 22:32:08 +00:00
Fix a nasty bug where a v4i64 was being wrongly emitted with 32-bit
permutations. Also tidy up some patterns and move them closer to their instruction definitions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138392 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
4477d691ed
commit
d8b7dd5252
@ -4188,20 +4188,21 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
|
|||||||
assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
|
assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
|
||||||
&& "Vector size not supported");
|
&& "Vector size not supported");
|
||||||
|
|
||||||
bool Is128 = VT.getSizeInBits() == 128;
|
if (VT.getSizeInBits() == 128) {
|
||||||
EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32;
|
V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
|
||||||
V = DAG.getNode(ISD::BITCAST, dl, NVT, V);
|
|
||||||
|
|
||||||
if (Is128) {
|
|
||||||
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
|
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
|
||||||
V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
|
V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
|
||||||
|
&SplatMask[0]);
|
||||||
} else {
|
} else {
|
||||||
// The second half of indices refer to the higher part, which is a
|
// To use VPERMILPS to splat scalars, the second half of indices must
|
||||||
// duplication of the lower one. This makes this shuffle a perfect match
|
// refer to the higher part, which is a duplication of the lower one,
|
||||||
// for the VPERM instruction.
|
// because VPERMILPS can only handle in-lane permutations.
|
||||||
int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
|
int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
|
||||||
EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
|
EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
|
||||||
V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
|
|
||||||
|
V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
|
||||||
|
V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
|
||||||
|
&SplatMask[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
return DAG.getNode(ISD::BITCAST, dl, VT, V);
|
return DAG.getNode(ISD::BITCAST, dl, VT, V);
|
||||||
@ -4217,6 +4218,9 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
|
|||||||
int NumElems = SrcVT.getVectorNumElements();
|
int NumElems = SrcVT.getVectorNumElements();
|
||||||
unsigned Size = SrcVT.getSizeInBits();
|
unsigned Size = SrcVT.getSizeInBits();
|
||||||
|
|
||||||
|
assert(((Size == 128 && NumElems > 4) || Size == 256) &&
|
||||||
|
"Unknown how to promote splat for type");
|
||||||
|
|
||||||
// Extract the 128-bit part containing the splat element and update
|
// Extract the 128-bit part containing the splat element and update
|
||||||
// the splat element index when it refers to the higher register.
|
// the splat element index when it refers to the higher register.
|
||||||
if (Size == 256) {
|
if (Size == 256) {
|
||||||
@ -4229,16 +4233,14 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
|
|||||||
// All i16 and i8 vector types can't be used directly by a generic shuffle
|
// All i16 and i8 vector types can't be used directly by a generic shuffle
|
||||||
// instruction because the target has no such instruction. Generate shuffles
|
// instruction because the target has no such instruction. Generate shuffles
|
||||||
// which repeat i16 and i8 several times until they fit in i32, and then can
|
// which repeat i16 and i8 several times until they fit in i32, and then can
|
||||||
// be manipulated by target supported shuffles. After the insertion of the
|
// be manipulated by target supported shuffles.
|
||||||
// necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
|
|
||||||
EVT EltVT = SrcVT.getVectorElementType();
|
EVT EltVT = SrcVT.getVectorElementType();
|
||||||
if (NumElems > 4 && (EltVT == MVT::i8 || EltVT == MVT::i16))
|
if (EltVT == MVT::i8 || EltVT == MVT::i16)
|
||||||
V1 = PromoteSplati8i16(V1, DAG, EltNo);
|
V1 = PromoteSplati8i16(V1, DAG, EltNo);
|
||||||
|
|
||||||
// Recreate the 256-bit vector and place the same 128-bit vector
|
// Recreate the 256-bit vector and place the same 128-bit vector
|
||||||
// into the low and high part. This is necessary because we want
|
// into the low and high part. This is necessary because we want
|
||||||
// to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles
|
// to use VPERM* to shuffle the vectors
|
||||||
// inside each separate v4f32 lane.
|
|
||||||
if (Size == 256) {
|
if (Size == 256) {
|
||||||
SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
|
SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
|
||||||
DAG.getConstant(0, MVT::i32), DAG, dl);
|
DAG.getConstant(0, MVT::i32), DAG, dl);
|
||||||
@ -6211,6 +6213,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
|
|||||||
// Handle splat operations
|
// Handle splat operations
|
||||||
if (SVOp->isSplat()) {
|
if (SVOp->isSplat()) {
|
||||||
unsigned NumElem = VT.getVectorNumElements();
|
unsigned NumElem = VT.getVectorNumElements();
|
||||||
|
int Size = VT.getSizeInBits();
|
||||||
// Special case, this is the only place now where it's allowed to return
|
// Special case, this is the only place now where it's allowed to return
|
||||||
// a vector_shuffle operation without using a target specific node, because
|
// a vector_shuffle operation without using a target specific node, because
|
||||||
// *hopefully* it will be optimized away by the dag combiner. FIXME: should
|
// *hopefully* it will be optimized away by the dag combiner. FIXME: should
|
||||||
@ -6223,7 +6226,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
|
|||||||
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
|
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
|
||||||
|
|
||||||
// Handle splats by matching through known shuffle masks
|
// Handle splats by matching through known shuffle masks
|
||||||
if (VT.is128BitVector() && NumElem <= 4)
|
if ((Size == 128 && NumElem <= 4) ||
|
||||||
|
(Size == 256 && NumElem < 8))
|
||||||
return SDValue();
|
return SDValue();
|
||||||
|
|
||||||
// All remaining splats are promoted to target supported vector shuffles.
|
// All remaining splats are promoted to target supported vector shuffles.
|
||||||
|
@ -473,13 +473,90 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
|
|||||||
(v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
|
(v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
|
||||||
}
|
}
|
||||||
|
|
||||||
def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
|
let Predicates = [HasAVX] in {
|
||||||
(MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
|
// MOVHPS patterns
|
||||||
let AddedComplexity = 20 in {
|
def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
|
||||||
def : Pat<(v4f32 (movddup VR128:$src, (undef))),
|
(VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
|
||||||
(MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
|
def : Pat<(X86Movlhps VR128:$src1,
|
||||||
def : Pat<(v2i64 (movddup VR128:$src, (undef))),
|
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
|
||||||
(MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
|
(VMOVHPSrm VR128:$src1, addr:$src2)>;
|
||||||
|
def : Pat<(X86Movlhps VR128:$src1,
|
||||||
|
(bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
|
||||||
|
(VMOVHPSrm VR128:$src1, addr:$src2)>;
|
||||||
|
|
||||||
|
// MOVLHPS patterns
|
||||||
|
let AddedComplexity = 20 in {
|
||||||
|
def : Pat<(v4f32 (movddup VR128:$src, (undef))),
|
||||||
|
(VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
|
||||||
|
def : Pat<(v2i64 (movddup VR128:$src, (undef))),
|
||||||
|
(VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
|
||||||
|
|
||||||
|
// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
|
||||||
|
def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
|
||||||
|
(VMOVLHPSrr VR128:$src1, VR128:$src2)>;
|
||||||
|
}
|
||||||
|
def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
|
||||||
|
(VMOVLHPSrr VR128:$src1, VR128:$src2)>;
|
||||||
|
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
|
||||||
|
(VMOVLHPSrr VR128:$src1, VR128:$src2)>;
|
||||||
|
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
|
||||||
|
(VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
|
||||||
|
|
||||||
|
// MOVHLPS patterns
|
||||||
|
let AddedComplexity = 20 in {
|
||||||
|
// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
|
||||||
|
def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
|
||||||
|
(VMOVHLPSrr VR128:$src1, VR128:$src2)>;
|
||||||
|
|
||||||
|
// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
|
||||||
|
def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
|
||||||
|
(VMOVHLPSrr VR128:$src1, VR128:$src1)>;
|
||||||
|
def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
|
||||||
|
(VMOVHLPSrr VR128:$src1, VR128:$src1)>;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let Predicates = [HasSSE1] in {
|
||||||
|
// MOVHPS patterns
|
||||||
|
def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
|
||||||
|
(MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
|
||||||
|
def : Pat<(X86Movlhps VR128:$src1,
|
||||||
|
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
|
||||||
|
(MOVHPSrm VR128:$src1, addr:$src2)>;
|
||||||
|
def : Pat<(X86Movlhps VR128:$src1,
|
||||||
|
(bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
|
||||||
|
(MOVHPSrm VR128:$src1, addr:$src2)>;
|
||||||
|
|
||||||
|
// MOVLHPS patterns
|
||||||
|
let AddedComplexity = 20 in {
|
||||||
|
def : Pat<(v4f32 (movddup VR128:$src, (undef))),
|
||||||
|
(MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
|
||||||
|
def : Pat<(v2i64 (movddup VR128:$src, (undef))),
|
||||||
|
(MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
|
||||||
|
|
||||||
|
// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
|
||||||
|
def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
|
||||||
|
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
|
||||||
|
}
|
||||||
|
def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
|
||||||
|
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
|
||||||
|
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
|
||||||
|
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
|
||||||
|
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
|
||||||
|
(MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
|
||||||
|
|
||||||
|
// MOVHLPS patterns
|
||||||
|
let AddedComplexity = 20 in {
|
||||||
|
// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
|
||||||
|
def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
|
||||||
|
(MOVHLPSrr VR128:$src1, VR128:$src2)>;
|
||||||
|
|
||||||
|
// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
|
||||||
|
def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
|
||||||
|
(MOVHLPSrr VR128:$src1, VR128:$src1)>;
|
||||||
|
def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
|
||||||
|
(MOVHLPSrr VR128:$src1, VR128:$src1)>;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
@ -4010,22 +4087,6 @@ def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
|
|||||||
(SHUFFLE_get_shuf_imm VR128:$src3))>,
|
(SHUFFLE_get_shuf_imm VR128:$src3))>,
|
||||||
Requires<[HasSSE2]>;
|
Requires<[HasSSE2]>;
|
||||||
|
|
||||||
let AddedComplexity = 20 in {
|
|
||||||
// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
|
|
||||||
def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
|
|
||||||
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
|
|
||||||
|
|
||||||
// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
|
|
||||||
def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
|
|
||||||
(MOVHLPSrr VR128:$src1, VR128:$src2)>;
|
|
||||||
|
|
||||||
// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
|
|
||||||
def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
|
|
||||||
(MOVHLPSrr VR128:$src1, VR128:$src1)>;
|
|
||||||
def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
|
|
||||||
(MOVHLPSrr VR128:$src1, VR128:$src1)>;
|
|
||||||
}
|
|
||||||
|
|
||||||
let AddedComplexity = 20 in {
|
let AddedComplexity = 20 in {
|
||||||
// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
|
// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
|
||||||
def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
|
def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
|
||||||
@ -6023,20 +6084,6 @@ def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))),
|
|||||||
def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
|
def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
|
||||||
(VUNPCKHPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
|
(VUNPCKHPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
|
||||||
|
|
||||||
// Shuffle with MOVLHPS
|
|
||||||
def : Pat<(X86Movlhps VR128:$src1,
|
|
||||||
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
|
|
||||||
(MOVHPSrm VR128:$src1, addr:$src2)>;
|
|
||||||
def : Pat<(X86Movlhps VR128:$src1,
|
|
||||||
(bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
|
|
||||||
(MOVHPSrm VR128:$src1, addr:$src2)>;
|
|
||||||
def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
|
|
||||||
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
|
|
||||||
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
|
|
||||||
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
|
|
||||||
def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
|
|
||||||
(MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
|
|
||||||
|
|
||||||
// FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the problem
|
// FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the problem
|
||||||
// is during lowering, where it's not possible to recognize the load fold cause
|
// is during lowering, where it's not possible to recognize the load fold cause
|
||||||
// it has two uses through a bitcast. One use disappears at isel time and the
|
// it has two uses through a bitcast. One use disappears at isel time and the
|
||||||
|
@ -21,8 +21,8 @@ entry:
|
|||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vmovd
|
; CHECK: vmovd
|
||||||
|
; CHECK-NEXT: vmovlhps %xmm
|
||||||
; CHECK-NEXT: vinsertf128 $1
|
; CHECK-NEXT: vinsertf128 $1
|
||||||
; CHECK-NEXT: vpermilps $0
|
|
||||||
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
|
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
|
||||||
entry:
|
entry:
|
||||||
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
|
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
|
||||||
@ -32,8 +32,8 @@ entry:
|
|||||||
ret <4 x i64> %vecinit6.i
|
ret <4 x i64> %vecinit6.i
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vinsertf128 $1
|
; CHECK: vshufpd $0
|
||||||
; CHECK-NEXT: vpermilps $0
|
; CHECK-NEXT: vinsertf128 $1
|
||||||
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
|
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
|
||||||
entry:
|
entry:
|
||||||
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
|
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
|
||||||
|
Loading…
x
Reference in New Issue
Block a user