From 825b93b2dfb101c8b5e32d6f7199e116d559f625 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Mon, 20 Jan 2014 19:35:22 +0000 Subject: [PATCH] [X86] Teach how to combine a vselect into a movss/movsd Add target specific rules for combining vselect dag nodes into movss/movsd when possible. If the vector type of the vselect dag node in input is either MVT::v4i32 or MVT::v4f32, then try to fold according to rules: 1) fold (vselect (build_vector (0, -1, -1, -1)), A, B) -> (movss A, B) 2) fold (vselect (build_vector (-1, 0, 0, 0)), A, B) -> (movss B, A) If the vector type of the vselect dag node in input is either MVT::v2i64 or MVT::v2f64 (and we have SSE2), then try to fold according to rules: 3) fold (vselect (build_vector (0, -1)), A, B) -> (movsd A, B) 4) fold (vselect (build_vector (-1, 0)), A, B) -> (movsd B, A) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@199683 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 35 ++++ test/CodeGen/X86/avx-blend.ll | 8 +- test/CodeGen/X86/blend-msb.ll | 13 +- test/CodeGen/X86/sse-scalar-fp-arith-2.ll | 208 ++++++++++++++++++++++ test/CodeGen/X86/sse2-blend.ll | 18 +- test/CodeGen/X86/sse41-blend.ll | 12 +- test/CodeGen/X86/vselect.ll | 88 +++++++++ 7 files changed, 355 insertions(+), 27 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 195f11a8264..82ac9f90826 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -17155,6 +17155,41 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // Try to fold this VSELECT into a MOVSS/MOVSD + if (N->getOpcode() == ISD::VSELECT && + Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) { + if (VT == MVT::v4i32 || VT == MVT::v4f32 || + (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) { + bool CanFold = false; + unsigned NumElems = Cond.getNumOperands(); + SDValue A = LHS; + SDValue B = RHS; + + if 
(isZero(Cond.getOperand(0))) { + CanFold = true; + + // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B) + // fold (vselect <0,-1> -> (movsd A, B) + for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) + CanFold = isAllOnes(Cond.getOperand(i)); + } else if (isAllOnes(Cond.getOperand(0))) { + CanFold = true; + std::swap(A, B); + + // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A) + // fold (vselect <-1,0> -> (movsd B, A) + for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) + CanFold = isZero(Cond.getOperand(i)); + } + + if (CanFold) { + if (VT == MVT::v4i32 || VT == MVT::v4f32) + return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG); + return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG); + } + } + } + // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll index a98e0761ce3..e9bfce663f6 100644 --- a/test/CodeGen/X86/avx-blend.ll +++ b/test/CodeGen/X86/avx-blend.ll @@ -6,7 +6,7 @@ ;CHECK: vblendvps ;CHECK: ret define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { - %vsel = select <4 x i1> , <4 x float> %v1, <4 x float> %v2 + %vsel = select <4 x i1> , <4 x float> %v1, <4 x float> %v2 ret <4 x float> %vsel } @@ -15,13 +15,13 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { ;CHECK: vblendvps ;CHECK: ret define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { - %vsel = select <4 x i1> , <4 x i32> %v1, <4 x i32> %v2 + %vsel = select <4 x i1> , <4 x i32> %v1, <4 x i32> %v2 ret <4 x i32> %vsel } ;CHECK-LABEL: vsel_double: -;CHECK: vblendvpd +;CHECK: vmovsd ;CHECK: ret define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) { %vsel = select <2 x i1> , <2 x double> %v1, <2 x double> %v2 @@ -30,7 +30,7 @@ define <2 x double> @vsel_double(<2 x double> 
%v1, <2 x double> %v2) { ;CHECK-LABEL: vsel_i64: -;CHECK: vblendvpd +;CHECK: vmovsd ;CHECK: ret define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) { %vsel = select <2 x i1> , <2 x i64> %v1, <2 x i64> %v2 diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll index 0485a42eb7e..6b465963292 100644 --- a/test/CodeGen/X86/blend-msb.ll +++ b/test/CodeGen/X86/blend-msb.ll @@ -1,13 +1,11 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s -; In this test we check that sign-extend of the mask bit is performed by -; shifting the needed bit to the MSB, and not using shl+sra. +; Verify that we produce movss instead of blendvps when possible. ;CHECK-LABEL: vsel_float: -;CHECK: movl $-1 -;CHECK-NEXT: movd -;CHECK-NEXT: blendvps +;CHECK-NOT: blendvps +;CHECK: movss ;CHECK: ret define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { %vsel = select <4 x i1> , <4 x float> %v1, <4 x float> %v2 @@ -15,9 +13,8 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { } ;CHECK-LABEL: vsel_4xi8: -;CHECK: movl $-1 -;CHECK-NEXT: movd -;CHECK-NEXT: blendvps +;CHECK-NOT: blendvps +;CHECK: movss ;CHECK: ret define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { %vsel = select <4 x i1> , <4 x i8> %v1, <4 x i8> %v2 diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll index 59685993f5d..600ee1b7b1e 100644 --- a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll +++ b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll @@ -213,3 +213,211 @@ define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) { ; CHECK-NOT: movsd ; CHECK: ret + +define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) { + %1 = fadd <4 x float> %a, %b + %2 = select <4 x i1> , <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +; CHECK-LABEL: test3_add_ss +; SSE2: addss %xmm1, %xmm0 +; AVX: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NOT: movss +; CHECK: ret + + +define <4 x float> 
@test3_sub_ss(<4 x float> %a, <4 x float> %b) { + %1 = fsub <4 x float> %a, %b + %2 = select <4 x i1> , <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +; CHECK-LABEL: test3_sub_ss +; SSE2: subss %xmm1, %xmm0 +; AVX: vsubss %xmm1, %xmm0, %xmm0 +; CHECK-NOT: movss +; CHECK: ret + + +define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) { + %1 = fmul <4 x float> %a, %b + %2 = select <4 x i1> , <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +; CHECK-LABEL: test3_mul_ss +; SSE2: mulss %xmm1, %xmm0 +; AVX: vmulss %xmm1, %xmm0, %xmm0 +; CHECK-NOT: movss +; CHECK: ret + + +define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) { + %1 = fdiv <4 x float> %a, %b + %2 = select <4 x i1> , <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +; CHECK-LABEL: test3_div_ss +; SSE2: divss %xmm1, %xmm0 +; AVX: vdivss %xmm1, %xmm0, %xmm0 +; CHECK-NOT: movss +; CHECK: ret + + +define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) { + %1 = fadd <2 x double> %a, %b + %2 = select <2 x i1> , <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +; CHECK-LABEL: test3_add_sd +; SSE2: addsd %xmm1, %xmm0 +; AVX: vaddsd %xmm1, %xmm0, %xmm0 +; CHECK-NOT: movsd +; CHECK: ret + + +define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) { + %1 = fsub <2 x double> %a, %b + %2 = select <2 x i1> , <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +; CHECK-LABEL: test3_sub_sd +; SSE2: subsd %xmm1, %xmm0 +; AVX: vsubsd %xmm1, %xmm0, %xmm0 +; CHECK-NOT: movsd +; CHECK: ret + + +define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) { + %1 = fmul <2 x double> %a, %b + %2 = select <2 x i1> , <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +; CHECK-LABEL: test3_mul_sd +; SSE2: mulsd %xmm1, %xmm0 +; AVX: vmulsd %xmm1, %xmm0, %xmm0 +; CHECK-NOT: movsd +; CHECK: ret + + +define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) { + %1 = fdiv <2 x double> %a, %b + %2 = select <2 x i1> , <2 x 
double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +; CHECK-LABEL: test3_div_sd +; SSE2: divsd %xmm1, %xmm0 +; AVX: vdivsd %xmm1, %xmm0, %xmm0 +; CHECK-NOT: movsd +; CHECK: ret + + +define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) { + %1 = fadd <4 x float> %b, %a + %2 = select <4 x i1> , <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +; CHECK-LABEL: test4_add_ss +; SSE2: addss %xmm0, %xmm1 +; AVX: vaddss %xmm0, %xmm1, %xmm0 +; CHECK-NOT: movss +; CHECK: ret + + +define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) { + %1 = fsub <4 x float> %b, %a + %2 = select <4 x i1> , <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +; CHECK-LABEL: test4_sub_ss +; SSE2: subss %xmm0, %xmm1 +; AVX: vsubss %xmm0, %xmm1, %xmm0 +; CHECK-NOT: movss +; CHECK: ret + + +define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) { + %1 = fmul <4 x float> %b, %a + %2 = select <4 x i1> , <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +; CHECK-LABEL: test4_mul_ss +; SSE2: mulss %xmm0, %xmm1 +; AVX: vmulss %xmm0, %xmm1, %xmm0 +; CHECK-NOT: movss +; CHECK: ret + + +define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) { + %1 = fdiv <4 x float> %b, %a + %2 = select <4 x i1> , <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +; CHECK-LABEL: test4_div_ss +; SSE2: divss %xmm0, %xmm1 +; AVX: vdivss %xmm0, %xmm1, %xmm0 +; CHECK-NOT: movss +; CHECK: ret + + +define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) { + %1 = fadd <2 x double> %b, %a + %2 = select <2 x i1> , <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +; CHECK-LABEL: test4_add_sd +; SSE2: addsd %xmm0, %xmm1 +; AVX: vaddsd %xmm0, %xmm1, %xmm0 +; CHECK-NOT: movsd +; CHECK: ret + + +define <2 x double> @test4_sub_sd(<2 x double> %a, <2 x double> %b) { + %1 = fsub <2 x double> %b, %a + %2 = select <2 x i1> , <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +; CHECK-LABEL: test4_sub_sd +; SSE2: subsd %xmm0, %xmm1 +; AVX: 
vsubsd %xmm0, %xmm1, %xmm0 +; CHECK-NOT: movsd +; CHECK: ret + + +define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) { + %1 = fmul <2 x double> %b, %a + %2 = select <2 x i1> , <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +; CHECK-LABEL: test4_mul_sd +; SSE2: mulsd %xmm0, %xmm1 +; AVX: vmulsd %xmm0, %xmm1, %xmm0 +; CHECK-NOT: movsd +; CHECK: ret + + +define <2 x double> @test4_div_sd(<2 x double> %a, <2 x double> %b) { + %1 = fdiv <2 x double> %b, %a + %2 = select <2 x i1> , <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +; CHECK-LABEL: test4_div_sd +; SSE2: divsd %xmm0, %xmm1 +; AVX: vdivsd %xmm0, %xmm1, %xmm0 +; CHECK-NOT: movsd +; CHECK: ret + diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll index 968595c383a..c63ff72b480 100644 --- a/test/CodeGen/X86/sse2-blend.ll +++ b/test/CodeGen/X86/sse2-blend.ll @@ -1,22 +1,22 @@ ; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s -; CHECK: vsel_float -; CHECK: xorps +; CHECK-LABEL: vsel_float +; CHECK-NOT: xorps ; CHECK: movss -; CHECK: orps +; CHECK-NOT: orps ; CHECK: ret define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) { %A = load <4 x float>* %v1 %B = load <4 x float>* %v2 - %vsel = select <4 x i1> , <4 x float> %A, <4 x float> %B + %vsel = select <4 x i1> , <4 x float> %A, <4 x float> %B store <4 x float > %vsel, <4 x float>* %v1 ret void } -; CHECK: vsel_i32 -; CHECK: xorps +; CHECK-LABEL: vsel_i32 +; CHECK-NOT: xorps ; CHECK: movss -; CHECK: orps +; CHECK-NOT: orps ; CHECK: ret define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) { %A = load <4 x i32>* %v1 @@ -27,7 +27,7 @@ define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) { } ; Without forcing instructions, fall back to the preferred PS domain. 
-; CHECK: vsel_i64 +; CHECK-LABEL: vsel_i64 ; CHECK: andnps ; CHECK: orps ; CHECK: ret @@ -41,7 +41,7 @@ define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) { } ; Without forcing instructions, fall back to the preferred PS domain. -; CHECK: vsel_double +; CHECK-LABEL: vsel_double ; CHECK: andnps ; CHECK: orps ; CHECK: ret diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll index a32f5de30a6..597852c3690 100644 --- a/test/CodeGen/X86/sse41-blend.ll +++ b/test/CodeGen/X86/sse41-blend.ll @@ -4,7 +4,7 @@ ;CHECK: blendvps ;CHECK: ret define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { - %vsel = select <4 x i1> , <4 x float> %v1, <4 x float> %v2 + %vsel = select <4 x i1> , <4 x float> %v1, <4 x float> %v2 ret <4 x float> %vsel } @@ -13,7 +13,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { ;CHECK: blendvps ;CHECK: ret define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { - %vsel = select <4 x i1> , <4 x i8> %v1, <4 x i8> %v2 + %vsel = select <4 x i1> , <4 x i8> %v1, <4 x i8> %v2 ret <4 x i8> %vsel } @@ -21,7 +21,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { ;CHECK: blendvps ;CHECK: ret define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) { - %vsel = select <4 x i1> , <4 x i16> %v1, <4 x i16> %v2 + %vsel = select <4 x i1> , <4 x i16> %v1, <4 x i16> %v2 ret <4 x i16> %vsel } @@ -30,13 +30,13 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) { ;CHECK: blendvps ;CHECK: ret define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { - %vsel = select <4 x i1> , <4 x i32> %v1, <4 x i32> %v2 + %vsel = select <4 x i1> , <4 x i32> %v1, <4 x i32> %v2 ret <4 x i32> %vsel } ;CHECK-LABEL: vsel_double: -;CHECK: blendvpd +;CHECK: movsd ;CHECK: ret define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) { %vsel = select <4 x i1> , <4 x double> %v1, <4 x double> %v2 @@ -45,7 +45,7 @@ define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) { ;CHECK-LABEL: 
vsel_i64: -;CHECK: blendvpd +;CHECK: movsd ;CHECK: ret define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) { %vsel = select <4 x i1> , <4 x i64> %v1, <4 x i64> %v2 diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll index 36c79838ff2..0cf03fc5d62 100644 --- a/test/CodeGen/X86/vselect.ll +++ b/test/CodeGen/X86/vselect.ll @@ -174,3 +174,91 @@ define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) { ; CHECK-NOT: xorps ; CHECK: ret +define <4 x float> @test18(<4 x float> %a, <4 x float> %b) { + %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b + ret <4 x float> %1 +} +; CHECK-LABEL: test18 +; CHECK-NOT: psllw +; CHECK-NOT: psraw +; CHECK-NOT: xorps +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { + %1 = select <4 x i1> , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %1 +} +; CHECK-LABEL: test19 +; CHECK-NOT: psllw +; CHECK-NOT: psraw +; CHECK-NOT: xorps +; CHECK: movss +; CHECK: ret + +define <2 x double> @test20(<2 x double> %a, <2 x double> %b) { + %1 = select <2 x i1> , <2 x double> %a, <2 x double> %b + ret <2 x double> %1 +} +; CHECK-LABEL: test20 +; CHECK-NOT: psllw +; CHECK-NOT: psraw +; CHECK-NOT: xorps +; CHECK: movsd +; CHECK: ret + +define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) { + %1 = select <2 x i1> , <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %1 +} +; CHECK-LABEL: test21 +; CHECK-NOT: psllw +; CHECK-NOT: psraw +; CHECK-NOT: xorps +; CHECK: movsd +; CHECK: ret + +define <4 x float> @test22(<4 x float> %a, <4 x float> %b) { + %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b + ret <4 x float> %1 +} +; CHECK-LABEL: test22 +; CHECK-NOT: psllw +; CHECK-NOT: psraw +; CHECK-NOT: xorps +; CHECK: movss +; CHECK: ret + +define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) { + %1 = select <4 x i1> , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %1 +} +; CHECK-LABEL: test23 +; CHECK-NOT: psllw +; CHECK-NOT: psraw +; CHECK-NOT: xorps +; CHECK: movss +; CHECK: ret + +define <2 x double> @test24(<2 x 
double> %a, <2 x double> %b) { + %1 = select <2 x i1> , <2 x double> %a, <2 x double> %b + ret <2 x double> %1 +} +; CHECK-LABEL: test24 +; CHECK-NOT: psllw +; CHECK-NOT: psraw +; CHECK-NOT: xorps +; CHECK: movsd +; CHECK: ret + +define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) { + %1 = select <2 x i1> , <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %1 +} +; CHECK-LABEL: test25 +; CHECK-NOT: psllw +; CHECK-NOT: psraw +; CHECK-NOT: xorps +; CHECK: movsd +; CHECK: ret +