diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8bf72d15e8c..b2eb0432e4c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -3713,7 +3713,7 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { /// type is 32 or 64. In the VPERMILPS the high half of the mask should point /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. +/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { if (!HasAVX) return false; @@ -6467,6 +6467,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG); + if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); @@ -6636,9 +6639,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); // Handle VPERMILPS/D* permutations - if (isVPERMILPMask(M, VT, HasAVX)) + if (isVPERMILPMask(M, VT, HasAVX)) { + if (HasAVX2 && VT == MVT::v8i32) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, + X86::getShuffleSHUFImmediate(SVOp), DAG); return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, X86::getShuffleSHUFImmediate(SVOp), DAG); + } // Handle VPERM2F128/VPERM2I128 permutations if (isVPERM2X128Mask(M, VT, HasAVX)) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index d9a599c1bda..fb70b9cf0af 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3993,21 +3993,19 @@ 
def mi : Ii8<0x70, MRMSrcMem, (undef))))]>; } -multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, PatFrag pshuf_frag, PatFrag bc_frag> { +multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, SDNode OpNode> { def Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (vt (pshuf_frag:$src2 VR256:$src1, - (undef))))]>; + [(set VR256:$dst, (vt (OpNode VR256:$src1, (i8 imm:$src2))))]>; def Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (vt (pshuf_frag:$src2 - (bc_frag (memopv4i64 addr:$src1)), - (undef))))]>; + [(set VR256:$dst, + (vt (OpNode (bitconvert (memopv4i64 addr:$src1)), + (i8 imm:$src2))))]>; } } // ExeDomain = SSEPackedInt @@ -4053,17 +4051,9 @@ let Predicates = [HasAVX] in { } let Predicates = [HasAVX2] in { - let AddedComplexity = 5 in - defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, pshufd, bc_v8i32>, TB, - OpSize, VEX; - - // SSE2 with ImmT == Imm8 and XS prefix. - defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, pshufhw, bc_v16i16>, XS, - VEX; - - // SSE2 with ImmT == Imm8 and XD prefix. 
- defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, pshuflw, bc_v16i16>, XD, - VEX; + defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>, TB, OpSize, VEX; + defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>, XS, VEX; + defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX; } let Predicates = [HasSSE2] in { @@ -4225,10 +4215,10 @@ let Predicates = [HasAVX] in { // Splat v2f64 / v2i64 let AddedComplexity = 10 in { - def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), - (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), (VPUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasAVX]>; + def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), + (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; } //===---------------------------------------------------------------------===// @@ -7200,6 +7190,19 @@ def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)), (VPERMILPSYmi addr:$src1, imm:$imm)>; def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi addr:$src1, imm:$imm)>; + +def : Pat<(v4f32 (X86VPermilp VR128:$src1, (i8 imm:$imm))), + (VPERMILPSri VR128:$src1, imm:$imm)>; +def : Pat<(v2f64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), + (VPERMILPDri VR128:$src1, imm:$imm)>; +def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), + (VPERMILPDri VR128:$src1, imm:$imm)>; +def : Pat<(v4f32 (X86VPermilp (memopv4f32 addr:$src1), (i8 imm:$imm))), + (VPERMILPSmi addr:$src1, imm:$imm)>; +def : Pat<(v2f64 (X86VPermilp (memopv2f64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDmi addr:$src1, imm:$imm)>; +def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDmi addr:$src1, imm:$imm)>; } //===----------------------------------------------------------------------===// diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll index 5a5c35333f2..947d79f9e4c 100644 --- a/test/CodeGen/X86/avx-shuffle.ll 
+++ b/test/CodeGen/X86/avx-shuffle.ll @@ -6,7 +6,7 @@ define <4 x float> @test1(<4 x float> %a) nounwind { ret <4 x float> %b ; CHECK: test1: ; CHECK: vshufps -; CHECK: vpshufd +; CHECK: vpermilps } ; rdar://10538417 @@ -98,22 +98,40 @@ define i32 @test10(<4 x i32> %a) nounwind { } define <4 x float> @test11(<4 x float> %a) nounwind { -; CHECK: pshufd $27 +; CHECK: test11 +; CHECK: vpermilps $27 %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x float> %tmp1 } define <4 x float> @test12(<4 x float>* %a) nounwind { -; CHECK: pshufd $27, ( +; CHECK: test12 +; CHECK: vpermilps $27, ( %tmp0 = load <4 x float>* %a %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x float> %tmp1 } -;CHECK: test13 -;CHECK: shufd -;CHECK: ret -define <4 x i32> @test13(<2 x i32>%x) nounwind readnone { +define <4 x i32> @test13(<4 x i32> %a) nounwind { +; CHECK: test13 +; CHECK: vpshufd $27 + %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test14(<4 x i32>* %a) nounwind { +; CHECK: test14 +; CHECK: vpshufd $27, ( + %tmp0 = load <4 x i32>* %a + %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x i32> %tmp1 +} + +; CHECK: test15 +; CHECK: vpshufd $8 +; CHECK: ret +define <4 x i32> @test15(<2 x i32>%x) nounwind readnone { %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ret <4 x i32>%x1 } + diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index f8522c26951..94bcddd9759 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -32,7 +32,7 @@ entry: ret <4 x i64> %vecinit6.i } -; CHECK: vshufpd $0 +; CHECK: vpermilpd $0 ; CHECK-NEXT: vinsertf128 $1 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/avx-vpermil.ll b/test/CodeGen/X86/avx-vpermil.ll index 3d521e7cea1..9707cd9b549 100644 --- a/test/CodeGen/X86/avx-vpermil.ll +++ 
b/test/CodeGen/X86/avx-vpermil.ll @@ -45,7 +45,7 @@ entry: ret <8 x float> %shuffle } -; CHECK-NOT: vpermilps +; CHECK: vpermilps define <8 x float> @funcF(<8 x float> %a) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32>