diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b9f53bde26b..e443dccc1c1 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1484,6 +1484,34 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, if (N1.getOpcode() == ISD::UNDEF) commuteShuffle(N1, N2, MaskVec); + // If shuffling a splat, try to blend the splat instead. We do this here so + // that even when this arises during lowering we don't have to re-handle it. + auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) { + BitVector UndefElements; + SDValue Splat = BV->getSplatValue(&UndefElements); + if (!Splat) + return; + + for (int i = 0; i < (int)NElts; ++i) { + if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + (int)NElts)) + continue; + + // If this input comes from undef, mark it as such. + if (UndefElements[MaskVec[i] - Offset]) { + MaskVec[i] = -1; + continue; + } + + // If we can blend a non-undef lane, use that instead. + if (!UndefElements[i]) + MaskVec[i] = i + Offset; + } + }; + if (auto *N1BV = dyn_cast(N1)) + BlendSplat(N1BV, 0); + if (auto *N2BV = dyn_cast(N2)) + BlendSplat(N2BV, NElts); + // Canonicalize all index into lhs, -> shuffle lhs, undef // Canonicalize all index into rhs, -> shuffle rhs, undef bool AllLHS = true, AllRHS = true; diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll index 62fd96102d0..75e0d80e7f4 100644 --- a/test/CodeGen/AArch64/arm64-vshuffle.ll +++ b/test/CodeGen/AArch64/arm64-vshuffle.ll @@ -29,14 +29,14 @@ entry: } ; CHECK: lCPI1_0: -; CHECK: .byte 2 ; 0x2 +; CHECK: .byte 0 ; 0x0 ; CHECK: .byte 255 ; 0xff -; CHECK: .byte 6 ; 0x6 +; CHECK: .byte 2 ; 0x2 ; CHECK: .byte 255 ; 0xff ; CHECK: .byte 10 ; 0xa ; CHECK: .byte 12 ; 0xc ; CHECK: .byte 14 ; 0xe -; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 7 ; 0x7 ; CHECK: test2 ; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF] ; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_1@PAGE @@ -82,22 +82,22 @@ bb: ret <16 x i1> %Shuff } ; CHECK: lCPI3_1: -; CHECK: .byte 2 ; 0x2 +; CHECK: .byte 0 ; 0x0 ; CHECK: .byte 1 ; 0x1 -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 18 ; 0x12 -; CHECK: .byte 10 ; 0xa -; CHECK: .byte 12 ; 0xc -; CHECK: .byte 14 ; 0xe -; CHECK: .byte 0 ; 0x0 ; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 31 ; 0x1f +; CHECK: .byte 18 ; 0x12 +; CHECK: .byte 4 ; 0x4 +; CHECK: .byte 5 ; 0x5 ; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 30 ; 0x1e +; CHECK: .byte 7 ; 0x7 +; CHECK: .byte 8 ; 0x8 +; CHECK: .byte 31 ; 0x1f ; CHECK: .byte 10 ; 0xa +; CHECK: .byte 30 ; 0x1e ; CHECK: .byte 12 ; 0xc +; CHECK: .byte 13 ; 0xd ; CHECK: .byte 14 ; 0xe -; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 15 ; 0xf ; CHECK: _test4: ; CHECK: ldr q[[REG1:[0-9]+]] ; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000 diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index 4bce1cf394f..ac1d5ef7092 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -237,7 +237,7 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test19: ; CHECK: # BB#0: ; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,3] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,3] ; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; CHECK-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,2] ; CHECK-NEXT: orps %xmm1, %xmm2 diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 4979b6ff767..5b72d797835 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -964,15 +964,15 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3] -; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X32-NEXT: movapd %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: insertps_with_undefs: ; X64: ## BB#0: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3] -; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-NEXT: movapd %xmm1, %xmm0 ; X64-NEXT: retq %1 = load float* %b, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll index 4f5f9f0607c..b77a1b5f493 100644 --- a/test/CodeGen/X86/vec_insert-5.ll +++ b/test/CodeGen/X86/vec_insert-5.ll @@ -25,8 +25,8 @@ define <4 x float> @t2(<4 x float>* %P) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movaps (%eax), %xmm1 ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; CHECK-NEXT: retl %tmp1 = load <4 x float>* %P %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > @@ -37,9 +37,9 @@ define <4 x float> @t3(<4 x float>* %P) nounwind { ; CHECK-LABEL: t3: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movaps (%eax), %xmm0 -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,0] +; CHECK-NEXT: movapd (%eax), %xmm0 +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-NEXT: retl %tmp1 = load <4 x float>* %P %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 > @@ -52,8 +52,8 @@ define <4 x float> @t4(<4 x float>* %P) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movaps (%eax), %xmm0 ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; CHECK-NEXT: retl %tmp1 = load <4 x float>* %P %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 > diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 72ffb297c04..85b19698f46 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -478,22 +478,22 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { ; SSE2-LABEL: shuffle_v4f32_z4zz: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_z4zz: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_z4zz: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v4f32_z4zz: @@ -513,24 +513,24 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { ; SSE2-LABEL: shuffle_v4f32_zz4z: ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_zz4z: ; SSE3: # BB#0: ; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE3-NEXT: movaps %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_zz4z: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -658,37 +658,77 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) { } define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) { -; SSE-LABEL: shuffle_v4f32_0z23: -; SSE: # BB#0: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v4f32_0z23: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_0z23: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_0z23: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_0z23: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v4f32_0z23: ; AVX: # BB#0: ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) { -; SSE-LABEL: shuffle_v4f32_01z3: -; SSE: # BB#0: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v4f32_01z3: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_01z3: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_01z3: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_01z3: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v4f32_01z3: ; AVX: # BB#0: ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> ret <4 x float> %shuffle @@ -732,18 +772,37 @@ define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) { } define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) { -; SSE-LABEL: shuffle_v4f32_0zz3: -; SSE: # BB#0: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v4f32_0zz3: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_0zz3: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_0zz3: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_0zz3: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v4f32_0zz3: ; AVX: # BB#0: ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> ret <4 x float> %shuffle @@ -1183,29 +1242,75 @@ define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) { } define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) { -; SSE-LABEL: shuffle_v4i32_0z23: -; SSE: # BB#0: -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v4i32_0z23: +; SSE2: # BB#0: +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0z23: -; AVX: # BB#0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-LABEL: shuffle_v4i32_0z23: +; SSE3: # BB#0: +; SSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_0z23: +; SSSE3: # BB#0: +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_0z23: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v4i32_0z23: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i32_0z23: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) { -; SSE-LABEL: shuffle_v4i32_01z3: -; SSE: # BB#0: -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v4i32_01z3: +; SSE2: # BB#0: +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_01z3: -; AVX: # BB#0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-LABEL: shuffle_v4i32_01z3: +; SSE3: # BB#0: +; SSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_01z3: +; SSSE3: # BB#0: +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_01z3: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v4i32_01z3: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i32_01z3: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1248,15 +1353,38 @@ define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) { } define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) { -; SSE-LABEL: shuffle_v4i32_0zz3: -; SSE: # BB#0: -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v4i32_0zz3: +; SSE2: # BB#0: +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: shuffle_v4i32_0zz3: -; AVX: # BB#0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-LABEL: shuffle_v4i32_0zz3: +; SSE3: # BB#0: +; SSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_0zz3: +; SSSE3: # BB#0: +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_0zz3: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v4i32_0zz3: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i32_0zz3: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 71348b1b2a9..e73e6cadd79 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -2074,14 +2074,26 @@ define <8 x i16> @shuffle_v8i16_0z234567(<8 x i16> %a) { } define <8 x i16> @shuffle_v8i16_0zzzz5z7(<8 x i16> %a) { -; SSE-LABEL: shuffle_v8i16_0zzzz5z7: -; SSE: # BB#0: -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v8i16_0zzzz5z7: +; SSE2: # BB#0: +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0zzzz5z7: +; SSSE3: # BB#0: +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0zzzz5z7: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] +; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_0zzzz5z7: ; AVX: # BB#0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index c2e06f5fcfa..f4ce8f526fb 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1421,10 +1421,9 @@ define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_z define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: @@ -1441,7 +1440,7 @@ define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_z ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,2,3,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,14,15,0,1,2,3,12,13,14,15] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 9905dfdac28..2b8d80aa755 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1802,9 +1802,8 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_ define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index 8d13ae45400..2fda3cf457a 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1856,8 +1856,8 @@ define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) { ; AVX1-LABEL: shuffle_v8i32_z0U2zUz6: ; AVX1: # BB#0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i32_z0U2zUz6: @@ -1872,8 +1872,8 @@ define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) { ; AVX1-LABEL: shuffle_v8i32_1U3z5zUU: ; AVX1: # BB#0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i32_1U3z5zUU: diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index 613f7c66794..a22dc6b7126 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -547,14 +547,14 @@ define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; SSE2: # BB#0: ; SSE2-NEXT: xorps %xmm1, %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test3c: ; SSSE3: # BB#0: ; SSSE3-NEXT: xorps %xmm1, %xmm0 ; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test3c: @@ -617,7 +617,7 @@ define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; SSE: # BB#0: ; SSE-NEXT: xorps %xmm1, %xmm0 ; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -625,7 +625,7 @@ define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; AVX: # BB#0: ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll index 2dbcaa1e582..2d047369fd3 100644 --- a/test/CodeGen/X86/vector-shuffle-sse1.ll +++ b/test/CodeGen/X86/vector-shuffle-sse1.ll @@ -106,8 +106,8 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { ; SSE1-LABEL: shuffle_v4f32_z4zz: ; SSE1: # BB#0: ; SSE1-NEXT: xorps %xmm1, %xmm1 -; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] -; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE1-NEXT: retq %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> ret <4 x float> %shuffle @@ -117,8 +117,8 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { ; SSE1-LABEL: shuffle_v4f32_zz4z: ; SSE1: # BB#0: ; SSE1-NEXT: xorps %xmm1, %xmm1 -; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE1-NEXT: movaps %xmm1, %xmm0 ; SSE1-NEXT: retq %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> @@ -193,7 +193,7 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) { ; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE1-NEXT: xorps %xmm2, %xmm2 -; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE1-NEXT: movaps %xmm1, %xmm0 ; SSE1-NEXT: retq @@ -214,7 +214,7 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) { ; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE1-NEXT: xorps %xmm2, %xmm2 -; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] ; SSE1-NEXT: retq %a = load <2 x float>* %ptr