From d8b7dd52525e99b998544bbeecb56907587b25a9 Mon Sep 17 00:00:00 2001
From: Bruno Cardoso Lopes
Date: Tue, 23 Aug 2011 22:06:37 +0000
Subject: [PATCH] Fix a nasty bug where a v4i64 was being wrongly emitted with
 32-bit permutations. Also tidy up some patterns and move them closer to their
 instruction definitions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138392 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  36 +++++----
 lib/Target/X86/X86InstrSSE.td      | 125 ++++++++++++++++++++---
 test/CodeGen/X86/avx-splat.ll      |   6 +-
 3 files changed, 109 insertions(+), 58 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d2b76904893..88566c687e2 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4188,20 +4188,21 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
   assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) &&
          "Vector size not supported");
 
-  bool Is128 = VT.getSizeInBits() == 128;
-  EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32;
-  V = DAG.getNode(ISD::BITCAST, dl, NVT, V);
-
-  if (Is128) {
+  if (VT.getSizeInBits() == 128) {
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
-    V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
+    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+                             &SplatMask[0]);
   } else {
-    // The second half of indicies refer to the higher part, which is a
-    // duplication of the lower one. This makes this shuffle a perfect match
-    // for the VPERM instruction.
+    // To use VPERMILPS to splat scalars, the second half of the indices must
+    // refer to the higher part, which is a duplication of the lower one,
+    // because VPERMILPS can only handle in-lane permutations.
     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
-    V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
+
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+                             &SplatMask[0]);
   }
 
   return DAG.getNode(ISD::BITCAST, dl, VT, V);
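The in-lane constraint described in the new comment is the crux of the fix:
VPERMILPS can only pick elements from within each 128-bit lane, so the upper
half of the mask must repeat the splat index offset by 4. A minimal standalone
sketch of that mask construction, in plain C++ rather than the LLVM API
(buildInLaneSplatMask is a hypothetical name used only for illustration):

    #include <array>
    #include <cassert>

    // Illustrative only: mirrors the SplatMask computation in getLegalSplat.
    // For a 256-bit vector viewed as v8f32, both 128-bit lanes replicate the
    // same element, so indices 4..7 must be EltNo + 4 (VPERMILPS is in-lane).
    std::array<int, 8> buildInLaneSplatMask(int EltNo) {
      assert(EltNo >= 0 && EltNo < 4 && "splat index must lie within one lane");
      return {EltNo,     EltNo,     EltNo,     EltNo,
              EltNo + 4, EltNo + 4, EltNo + 4, EltNo + 4};
    }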
@@ -4217,6 +4218,9 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   int NumElems = SrcVT.getVectorNumElements();
   unsigned Size = SrcVT.getSizeInBits();
 
+  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
+         "Unknown how to promote splat for type");
+
   // Extract the 128-bit part containing the splat element and update
   // the splat element index when it refers to the higher register.
   if (Size == 256) {
@@ -4229,16 +4233,14 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   // All i16 and i8 vector types can't be used directly by a generic shuffle
   // instruction because the target has no such instruction. Generate shuffles
   // which repeat i16 and i8 several times until they fit in i32, and then can
-  // be manipulated by target suported shuffles. After the insertion of the
-  // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
+  // be manipulated by target supported shuffles.
   EVT EltVT = SrcVT.getVectorElementType();
-  if (NumElems > 4 && (EltVT == MVT::i8 || EltVT == MVT::i16))
+  if (EltVT == MVT::i8 || EltVT == MVT::i16)
     V1 = PromoteSplati8i16(V1, DAG, EltNo);
 
   // Recreate the 256-bit vector and place the same 128-bit vector
   // into the low and high part. This is necessary because we want
-  // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles
-  // inside each separate v4f32 lane.
+  // to use VPERM* to shuffle the vectors.
   if (Size == 256) {
     SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
                          DAG.getConstant(0, MVT::i32), DAG, dl);
@@ -6211,6 +6213,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
   // Handle splat operations
   if (SVOp->isSplat()) {
     unsigned NumElem = VT.getVectorNumElements();
+    int Size = VT.getSizeInBits();
     // Special case, this is the only place now where it's allowed to return
     // a vector_shuffle operation without using a target specific node, because
     // *hopefully* it will be optimized away by the dag combiner. FIXME: should
@@ -6223,7 +6226,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
 
     // Handle splats by matching through known shuffle masks
-    if (VT.is128BitVector() && NumElem <= 4)
+    if ((Size == 128 && NumElem <= 4) ||
+        (Size == 256 && NumElem < 8))
       return SDValue();
 
     // All remaining splats are promoted to target supported vector shuffles.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index d1f74842c37..40dd294d741 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -473,13 +473,90 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
                       (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
 }
 
-def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
-          (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
-let AddedComplexity = 20 in {
-  def : Pat<(v4f32 (movddup VR128:$src, (undef))),
-            (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
-  def : Pat<(v2i64 (movddup VR128:$src, (undef))),
-            (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+let Predicates = [HasAVX] in {
+  // MOVHPS patterns
+  def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+  // MOVLHPS patterns
+  let AddedComplexity = 20 in {
+    def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+              (VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+    def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+              (VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+
+    // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+    def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+              (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  }
+  def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  let AddedComplexity = 20 in {
+    // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+    def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+              (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+    // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+    def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+              (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
+    def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+              (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
+  }
+}
+
+let Predicates = [HasSSE1] in {
+  // MOVHPS patterns
+  def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+
+  // MOVLHPS patterns
+  let AddedComplexity = 20 in {
+    def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+              (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+    def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+              (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+
+    // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+    def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+              (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  }
+  def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  let AddedComplexity = 20 in {
+    // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+    def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+              (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+    // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+    def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+              (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+    def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+              (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+  }
 }
 
 //===----------------------------------------------------------------------===//
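As a reading aid for the pattern blocks above, here is a minimal scalar model,
in plain C++ (not TableGen and not the LLVM API), of the two shuffles these
patterns select; the element masks match the comments in the patterns:

    #include <array>

    // MOVLHPS = vector_shuffle <0, 1, 4, 5>: the low two f32 elements of
    // each source form the result.
    std::array<float, 4> movlhps(const std::array<float, 4> &a,
                                 const std::array<float, 4> &b) {
      return {a[0], a[1], b[0], b[1]};
    }

    // MOVHLPS = vector_shuffle <6, 7, 2, 3>: the high half of the second
    // source lands in the low half of the result; the first source keeps
    // its own high half.
    std::array<float, 4> movhlps(const std::array<float, 4> &a,
                                 const std::array<float, 4> &b) {
      return {b[2], b[3], a[2], a[3]};
    }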
@@ -4010,22 +4087,6 @@ def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
                        (SHUFFLE_get_shuf_imm VR128:$src3))>,
       Requires<[HasSSE2]>;
 
-let AddedComplexity = 20 in {
-// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
-def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
-          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-
-// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
-def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
-          (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-
-// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
-def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
-          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
-def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
-          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
-}
-
 let AddedComplexity = 20 in {
 // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
 def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
@@ -6023,20 +6084,6 @@ def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))),
 def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
           (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
 
-// Shuffle with MOVLHPS
-def : Pat<(X86Movlhps VR128:$src1,
-                    (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
-          (MOVHPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(X86Movlhps VR128:$src1,
-                    (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
-          (MOVHPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
-          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
-          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
-          (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
-
 // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the problem
 // is during lowering, where it's not possible to recognize the load fold because
 // it has two uses through a bitcast. One use disappears at isel time and the
 // fold opportunity reappears.
@@ -6108,8 +6155,8 @@ def : Pat<(X86Movlps VR128:$src1,
 def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
           (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>;
-def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;
+def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;
 
 // Shuffle with MOVLPD
 def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 38b2ff3b069..f8522c26951 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -21,8 +21,8 @@ entry:
 }
 
 ; CHECK: vmovd
+; CHECK-NEXT: vmovlhps %xmm
 ; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
   %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
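The funcC expectations above encode the fixed v4i64 splat lowering. A rough
scalar model of the checked instruction sequence, in plain C++ and only as an
illustration of the data movement (splat_v4i64 is a hypothetical name; the old
vpermilps $0 path permuted 32-bit elements and so could not splat a 64-bit
value correctly):

    #include <array>
    #include <cstdint>

    std::array<uint64_t, 4> splat_v4i64(uint64_t q) {
      // vmovd materializes the scalar; vmovlhps duplicates the low
      // 64 bits within the 128-bit register.
      std::array<uint64_t, 2> xmm = {q, q};
      // vinsertf128 $1 copies that 128-bit half into the upper lane.
      return {xmm[0], xmm[1], xmm[0], xmm[1]};
    }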
@@ -32,8 +32,8 @@ entry:
   ret <4 x i64> %vecinit6.i
 }
 
-; CHECK: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
+; CHECK: vshufpd $0
+; CHECK-NEXT: vinsertf128 $1
 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
 entry:
   %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
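For reference, the splats these tests exercise correspond at the source level
to the AVX set1 intrinsics. This equivalence is an assumption made here for
illustration (the patch itself does not state it), and the function names are
hypothetical:

    #include <immintrin.h>

    // Compile with AVX enabled (e.g. -mavx).
    __m256i splat4_i64(long long q) { return _mm256_set1_epi64x(q); } // funcC
    __m256d splat4_f64(double q)    { return _mm256_set1_pd(q); }     // funcD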