From bf53841cfe3c341ebc0fca102d641c2018855254 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Thu, 11 Apr 2013 05:15:54 +0000 Subject: [PATCH] Optimize vector select from all 0s or all 1s As packed comparisons in AVX/SSE produce all 0s or all 1s in each SIMD lane, vector select could be simplified to AND/OR or removed if one or both values being selected is all 0s or all 1s. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179267 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 45 ++++++++++++++++ test/CodeGen/X86/select-with-and-or.ll | 72 ++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 test/CodeGen/X86/select-with-and-or.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 806e72f2273..3564ce39206 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -15787,6 +15787,51 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget)) return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS); + // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. + if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT && + Cond.getOpcode() == ISD::SETCC) { + + assert(Cond.getValueType().isVector() && + "vector select expects a vector selector!"); + + EVT IntVT = Cond.getValueType(); + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + if (!TValIsAllOnes && !FValIsAllZeros) { + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. 
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); + + if (TValIsAllZeros || FValIsAllOnes) { + SDValue CC = Cond.getOperand(2); + ISD::CondCode NewCC = + ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), + Cond.getOperand(0).getValueType().isInteger()); + Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); + std::swap(LHS, RHS); + TValIsAllOnes = FValIsAllOnes; + FValIsAllZeros = TValIsAllZeros; + } + } + + if (TValIsAllOnes || FValIsAllZeros) { + SDValue Ret; + + if (TValIsAllOnes && FValIsAllZeros) + Ret = Cond; + else if (TValIsAllOnes) + Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond, + DAG.getNode(ISD::BITCAST, DL, IntVT, RHS)); + else if (FValIsAllZeros) + Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond, + DAG.getNode(ISD::BITCAST, DL, IntVT, LHS)); + + return DAG.getNode(ISD::BITCAST, DL, VT, Ret); + } + } + // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. 
Try to use SimplifyDemandedBits diff --git a/test/CodeGen/X86/select-with-and-or.ll b/test/CodeGen/X86/select-with-and-or.ll new file mode 100644 index 00000000000..1ccf30bf208 --- /dev/null +++ b/test/CodeGen/X86/select-with-and-or.ll @@ -0,0 +1,72 @@ +; RUN: opt < %s -O3 | \ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s + +define <4 x i32> @test1(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> zeroinitializer + ret <4 x i32> %r +; CHECK: test1 +; CHECK: cmpnle +; CHECK-NEXT: andps +; CHECK: ret +} + +define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c + ret <4 x i32> %r +; CHECK: test2 +; CHECK: cmpnle +; CHECK-NEXT: orps +; CHECK: ret +} + +define <4 x i32> @test3(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> %c + ret <4 x i32> %r +; CHECK: test3 +; CHECK: cmple +; CHECK-NEXT: andps +; CHECK: ret +} + +define <4 x i32> @test4(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + ret <4 x i32> %r +; CHECK: test4 +; CHECK: cmple +; CHECK-NEXT: orps +; CHECK: ret +} + +define <4 x i32> @test5(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> zeroinitializer + ret <4 x i32> %r +; CHECK: test5 +; CHECK: cmpnle +; CHECK-NEXT: ret +} + +define <4 x i32> @test6(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + ret <4 x i32> %r +; CHECK: test6 +; CHECK: cmple +; CHECK-NEXT: ret +} + +define <4 x i32> @test7(<4 x float> %a, <4 x float> %b, <4 x i32>* %p) { + %f = fcmp ult <4 x float> %a, 
%b + %s = sext <4 x i1> %f to <4 x i32> + %l = load <4 x i32>* %p + %r = and <4 x i32> %l, %s + ret <4 x i32> %r +; CHECK: test7 +; CHECK: cmpnle +; CHECK-NEXT: andps +; CHECK: ret +}