diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 045bb933cbb..1e8b27b8d10 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -981,6 +981,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   if (Subtarget->is64Bit())
@@ -8870,6 +8871,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
+  case X86ISD::PANDN:              return "X86ISD::PANDN";
+  case X86ISD::PSIGNB:             return "X86ISD::PSIGNB";
+  case X86ISD::PSIGNW:             return "X86ISD::PSIGNW";
+  case X86ISD::PSIGND:             return "X86ISD::PSIGND";
   case X86ISD::FMAX:               return "X86ISD::FMAX";
   case X86ISD::FMIN:               return "X86ISD::FMIN";
   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
@@ -11053,6 +11058,36 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   return SDValue();
 }
 
+
+static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Want to form PANDN nodes, in the hopes of then easily combining them with
+  // OR and AND nodes to form PBLEND/PSIGN.
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v2i64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Check LHS for vnot
+  if (N0.getOpcode() == ISD::XOR &&
+      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::PANDN, DL, VT, N0.getOperand(0), N1);
+
+  // Check RHS for vnot
+  if (N1.getOpcode() == ISD::XOR &&
+      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::PANDN, DL, VT, N1.getOperand(0), N0);
+
+  return SDValue();
+}
+
 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
@@ -11060,12 +11095,101 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   EVT VT = N->getValueType(0);
-  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
     return SDValue();
 
-  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+
+  // look for psign/blend
+  if (Subtarget->hasSSSE3()) {
+    if (VT == MVT::v2i64) {
+      // Canonicalize pandn to RHS
+      if (N0.getOpcode() == X86ISD::PANDN)
+        std::swap(N0, N1);
+      // or (and (m, x), (pandn m, y))
+      if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::PANDN) {
+        SDValue Mask = N1.getOperand(0);
+        SDValue X = N1.getOperand(1);
+        SDValue Y;
+        if (N0.getOperand(0) == Mask)
+          Y = N0.getOperand(1);
+        if (N0.getOperand(1) == Mask)
+          Y = N0.getOperand(0);
+
+        // Check to see if the mask appeared in both the AND and PANDN.
+        if (!Y.getNode())
+          return SDValue();
+
+        // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
+        if (Mask.getOpcode() != ISD::BITCAST ||
+            X.getOpcode() != ISD::BITCAST ||
+            Y.getOpcode() != ISD::BITCAST)
+          return SDValue();
+
+        // Look through mask bitcast.
+        Mask = Mask.getOperand(0);
+        EVT MaskVT = Mask.getValueType();
+
+        // Validate that the Mask operand is a vector sra node.  The sra node
+        // will be an intrinsic.
+        if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+          return SDValue();
+
+        // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+        // there is no psrai.b
+        switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
+        case Intrinsic::x86_sse2_psrai_w:
+        case Intrinsic::x86_sse2_psrai_d:
+          break;
+        default: return SDValue();
+        }
+
+        // Check that the SRA is all signbits.
+        SDValue SraC = Mask.getOperand(2);
+        unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
+        unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+        if ((SraAmt + 1) != EltBits)
+          return SDValue();
+
+        DebugLoc DL = N->getDebugLoc();
+
+        // Now we know we at least have a pblendvb with the mask val.  See if
+        // we can form a psignb/w/d.
+        // psign = x.type == y.type == mask.type && y = sub(0, x);
+        X = X.getOperand(0);
+        Y = Y.getOperand(0);
+        if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+            ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+            X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){
+          unsigned Opc = 0;
+          switch (EltBits) {
+          case 8: Opc = X86ISD::PSIGNB; break;
+          case 16: Opc = X86ISD::PSIGNW; break;
+          case 32: Opc = X86ISD::PSIGND; break;
+          default: break;
+          }
+          if (Opc) {
+            SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1));
+            return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign);
+          }
+        }
+        // PBLENDVB only available on SSE 4.1
+        if (!Subtarget->hasSSE41())
+          return SDValue();
+
+        unsigned IID = Intrinsic::x86_sse41_pblendvb;
+        X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
+        Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
+        Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
+        Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
+                           DAG.getConstant(IID, MVT::i32), X, Y, Mask);
+        return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
+      }
+    }
+  }
+
+  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
     std::swap(N0, N1);
   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
@@ -11116,7 +11240,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                      DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
   }
 
-  
+
   return SDValue();
 }
 
@@ -11334,6 +11458,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
+  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 35f63b26df6..5fea01de09d 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -159,7 +159,13 @@ namespace llvm {
       /// PSHUFB - Shuffle 16 8-bit values within a vector.
       PSHUFB,
-
+
+      /// PANDN - and with not'd value.
+      PANDN,
+
+      /// PSIGNB/W/D - Copy integer sign.
+      PSIGNB, PSIGNW, PSIGND,
+
       /// FMAX, FMIN - Floating point max and min.
       FMAX, FMIN,
 
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 70c3d076d50..954dbbe45bc 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -43,6 +43,18 @@ def X86ucomi   : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
 def X86pshufb  : SDNode<"X86ISD::PSHUFB",
                  SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
+def X86pandn   : SDNode<"X86ISD::PANDN",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86psignb  : SDNode<"X86ISD::PSIGNB",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86psignw  : SDNode<"X86ISD::PSIGNW",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86psignd  : SDNode<"X86ISD::PSIGND",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
 def X86pextrb  : SDNode<"X86ISD::PEXTRB",
                  SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
 def X86pextrw  : SDNode<"X86ISD::PEXTRW",
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index c52731ac831..058b79b3fac 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1584,19 +1584,13 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
 let isCommutable = 0 in
   defm ANDN : sse12_fp_packed_logical<0x55, "andn", undef /* dummy */, 1, [
     // single r+r
-    [(set VR128:$dst, (v2i64 (and (xor VR128:$src1,
-                                       (bc_v2i64 (v4i32 immAllOnesV))),
-                                  VR128:$src2)))],
+    [(set VR128:$dst, (X86pandn VR128:$src1, VR128:$src2))],
     // double r+r
-    [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
-                           (bc_v2i64 (v2f64 VR128:$src2))))],
+    [],
     // single r+m
-    [(set VR128:$dst, (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
-                                       (bc_v2i64 (v4i32 immAllOnesV))),
-                                  (memopv2i64 addr:$src2))))],
+    [(set VR128:$dst, (X86pandn VR128:$src1, (memopv2i64 addr:$src2)))],
     // double r+m
-    [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
-                           (memopv2i64 addr:$src2)))]]>;
+    []]>;
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Arithmetic Instructions
@@ -2536,15 +2530,11 @@ let ExeDomain = SSEPackedInt in {
   }
   def PANDNrr : PDI<0xDF, MRMSrcReg,
                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                    "pandn\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
-                                              VR128:$src2)))]>;
+                    "pandn\t{$src2, $dst|$dst, $src2}", []>;
 
   def PANDNrm : PDI<0xDF, MRMSrcMem,
                     (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
-                    "pandn\t{$src2, $dst|$dst, $src2}",
-                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
-                                              (memopv2i64 addr:$src2))))]>;
+                    "pandn\t{$src2, $dst|$dst, $src2}", []>;
   }
 } // Constraints = "$src1 = $dst"
@@ -3608,6 +3598,13 @@ def : Pat<(X86pshufb VR128:$src, VR128:$mask),
 def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
           (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
 
+def : Pat<(X86psignb VR128:$src1, VR128:$src2),
+          (PSIGNBrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
+def : Pat<(X86psignw VR128:$src1, VR128:$src2),
+          (PSIGNWrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
+def : Pat<(X86psignd VR128:$src1, VR128:$src2),
+          (PSIGNDrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
+
 //===---------------------------------------------------------------------===//
 // SSSE3 - Packed Align Instruction Patterns
 //===---------------------------------------------------------------------===//
@@ -3896,27 +3893,6 @@ def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
           (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
 
-// Some special case pandn patterns.
-def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
-                  VR128:$src2)),
-          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
-                  VR128:$src2)),
-          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
-                  VR128:$src2)),
-          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
-
-def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
-                  (memop addr:$src2))),
-          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
-                  (memop addr:$src2))),
-          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
-                  (memop addr:$src2))),
-          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
-
 // vector -> vector casts
 def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
           (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
diff --git a/test/CodeGen/X86/vec-sign.ll b/test/CodeGen/X86/vec-sign.ll
new file mode 100644
index 00000000000..0d7e9e5e455
--- /dev/null
+++ b/test/CodeGen/X86/vec-sign.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mcpu=nehalem | FileCheck %s
+
+define <4 x i32> @psignd(<4 x i32> %a, <4 x i32> %b) nounwind ssp {
+entry:
+; CHECK: psignd
+; CHECK-NOT: sub
+  %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
+  %sub = sub nsw <4 x i32> zeroinitializer, %a
+  %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %1 = and <4 x i32> %a, %0
+  %2 = and <4 x i32> %b.lobit, %sub
+  %cond = or <4 x i32> %1, %2
+  ret <4 x i32> %cond
+}
+
+define <4 x i32> @pblendvb(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) nounwind ssp {
+entry:
+; CHECK: pblendvb
+  %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
+  %sub = sub nsw <4 x i32> zeroinitializer, %a
+  %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %1 = and <4 x i32> %c, %0
+  %2 = and <4 x i32> %a, %b.lobit
+  %cond = or <4 x i32> %1, %2
+  ret <4 x i32> %cond
+}