diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td index 35c50118389..cc263e49d62 100644 --- a/include/llvm/IntrinsicsX86.td +++ b/include/llvm/IntrinsicsX86.td @@ -744,6 +744,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_pcmpeqq : GCCBuiltin<"__builtin_ia32_pcmpeqq">, Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>; + def int_x86_sse42_pcmpgtq : GCCBuiltin<"__builtin_ia32_pcmpgtq">, + Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + [IntrNoMem]>; def int_x86_sse41_pmaxsb : GCCBuiltin<"__builtin_ia32_pmaxsb128">, Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1c93477ec39..8bfa574fb79 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -620,7 +620,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); - setOperationAction(ISD::VSETCC, MVT::v4f32, Legal); + setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); } if (Subtarget->hasSSE2()) { @@ -646,11 +646,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); - setOperationAction(ISD::VSETCC, MVT::v2f64, Legal); - setOperationAction(ISD::VSETCC, MVT::v16i8, Legal); - setOperationAction(ISD::VSETCC, MVT::v8i16, Legal); - setOperationAction(ISD::VSETCC, MVT::v4i32, Legal); - setOperationAction(ISD::VSETCC, MVT::v2i64, Legal); + setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); + setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); + setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); + setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); @@ -728,6 +727,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } + if (Subtarget->hasSSE42()) { + setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); + } + // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -4685,6 +4688,113 @@ SDOperand X86TargetLowering::LowerSETCC(SDOperand Op, SelectionDAG &DAG) { } } +SDOperand X86TargetLowering::LowerVSETCC(SDOperand Op, SelectionDAG &DAG) { + SDOperand Cond; + SDOperand Op0 = Op.getOperand(0); + SDOperand Op1 = Op.getOperand(1); + SDOperand CC = Op.getOperand(2); + MVT VT = Op.getValueType(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); + + if (isFP) { + unsigned SSECC = 8; + unsigned Opc = Op0.getValueType() == MVT::v4f32 ? X86ISD::CMPPS : + X86ISD::CMPPD; + bool Swap = false; + + switch (SetCCOpcode) { + default: break; + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETOGT: + case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETLT: + case ISD::SETOLT: SSECC = 1; break; + case ISD::SETOGE: + case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETLE: + case ISD::SETOLE: SSECC = 2; break; + case ISD::SETUO: SSECC = 3; break; + case ISD::SETONE: + case ISD::SETNE: SSECC = 4; break; + case ISD::SETULE: Swap = true; + case ISD::SETUGE: SSECC = 5; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: SSECC = 6; break; + case ISD::SETO: SSECC = 7; break; + } + if (Swap) + std::swap(Op0, Op1); + + // In the one special case we can't handle, emit two comparisons.
+ if (SSECC == 8) { + SDOperand UNORD, EQ; + + assert(SetCCOpcode == ISD::SETUEQ && "Illegal FP comparison"); + + UNORD = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); + EQ = DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); + return DAG.getNode(ISD::OR, VT, UNORD, EQ); + } + // Handle all other FP comparisons here. + return DAG.getNode(Opc, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); + } + + // We are handling one of the integer comparisons here. Since SSE only has + // GT and EQ comparisons for integer, swapping operands and multiple + // operations may be required for some comparisons. + unsigned Opc = 0, EQOpc = 0, GTOpc = 0; + bool Swap = false, Invert = false, FlipSigns = false; + + switch (VT.getSimpleVT()) { + default: break; + case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; + case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; + case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; + case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; + } + + switch (SetCCOpcode) { + default: break; + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = EQOpc; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = GTOpc; break; + case ISD::SETGE: Swap = true; + case ISD::SETLE: Opc = GTOpc; Invert = true; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; + case ISD::SETUGE: Swap = true; + case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; + } + if (Swap) + std::swap(Op0, Op1); + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. 
+ if (FlipSigns) { + MVT EltVT = VT.getVectorElementType(); + SDOperand SignBit = DAG.getConstant(EltVT.getIntegerVTSignBit(), EltVT); + std::vector<SDOperand> SignBits(VT.getVectorNumElements(), SignBit); + SDOperand SignVec = DAG.getNode(ISD::BUILD_VECTOR, VT, &SignBits[0], + SignBits.size()); + Op0 = DAG.getNode(ISD::XOR, VT, Op0, SignVec); + Op1 = DAG.getNode(ISD::XOR, VT, Op1, SignVec); + } + + SDOperand Result = DAG.getNode(Opc, VT, Op0, Op1); + + // If the logical-not of the result is required, perform that now. + if (Invert) { + MVT EltVT = VT.getVectorElementType(); + SDOperand NegOne = DAG.getConstant(EltVT.getIntegerVTBitMask(), EltVT); + std::vector<SDOperand> NegOnes(VT.getVectorNumElements(), NegOne); + SDOperand NegOneV = DAG.getNode(ISD::BUILD_VECTOR, VT, &NegOnes[0], + NegOnes.size()); + Result = DAG.getNode(ISD::XOR, VT, Result, NegOneV); + } + return Result; +} SDOperand X86TargetLowering::LowerSELECT(SDOperand Op, SelectionDAG &DAG) { bool addTest = true; @@ -5728,6 +5838,7 @@ SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) { case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::VSETCC: return LowerVSETCC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -5819,6 +5930,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; + case X86ISD::CMPPD: return "X86ISD::CMPPD"; + case X86ISD::CMPPS: return "X86ISD::CMPPS"; + case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; + case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; + case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; + case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; + case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; + case
X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; + case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; + case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; } } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 648fe3bb1d2..125a30dd66a 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -208,7 +208,14 @@ namespace llvm { VZEXT_LOAD, // VSHL, VSRL - Vector logical left / right shift. - VSHL, VSRL + VSHL, VSRL, + + // CMPPD, CMPPS - Vector double/float comparison. + CMPPD, CMPPS, + + // PCMP* - Vector integer comparisons. + PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ, + PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ }; } @@ -521,6 +528,7 @@ namespace llvm { SDOperand LowerFNEG(SDOperand Op, SelectionDAG &DAG); SDOperand LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG); SDOperand LowerSETCC(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerVSETCC(SDOperand Op, SelectionDAG &DAG); SDOperand LowerSELECT(SDOperand Op, SelectionDAG &DAG); SDOperand LowerBRCOND(SDOperand Op, SelectionDAG &DAG); SDOperand LowerMEMSET(SDOperand Op, SelectionDAG &DAG); diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 8d098f12ae2..d5f0efb988c 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -215,6 +215,12 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE41]>; +// SSE4.2 Instruction Templates: +// +// SS428I - SSE 4.2 instructions with T8 prefix. +class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE42]>; // X86-64 Instruction templates...
// diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 16a52024b57..e11967dcbbb 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -20,6 +20,8 @@ def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2> ]>; +def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisFP<1>, SDTCisVT<3, i8>]>; def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -53,6 +55,16 @@ def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad]>; def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>; def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>; +def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>; +def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>; +def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>; +def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>; +def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; +def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; //===----------------------------------------------------------------------===// // SSE Complex Patterns @@ -163,22 +175,6 @@ def PSxLDQ_imm : SDNodeXForm<imm, [{ return getI8Imm(N->getValue() >> 3); }]>; -def SSE_CC_imm : SDNodeXForm<cond, [{ - unsigned Val; - switch (N->get()) { - default: Val = 0; assert(0 && "Unexpected CondCode"); break; - case ISD::SETOEQ: Val = 0; break; - case ISD::SETOLT: Val = 1; break; - case ISD::SETOLE: Val = 2; break; - case ISD::SETUO: Val = 3; break; - case ISD::SETONE: Val = 4; break; - case ISD::SETOGE: Val = 5; break; - case ISD::SETOGT: Val = 6; break; - case ISD::SETO: Val = 7; break; - } - return getI8Imm(Val); -}]>; - // SHUFFLE_get_shuf_imm xform
function: convert vector_shuffle mask to PSHUF*, // SHUFP* etc. imm. def SHUFFLE_get_shuf_imm : SDNodeXForm; } -def : Pat<(v4i32 (vsetcc (v4f32 VR128:$src1), VR128:$src2, cond:$cc)), - (CMPPSrri VR128:$src1, VR128:$src2, (SSE_CC_imm cond:$cc))>; -def : Pat<(v4i32 (vsetcc (v4f32 VR128:$src1), (memop addr:$src2), cond:$cc)), - (CMPPSrmi VR128:$src1, addr:$src2, (SSE_CC_imm cond:$cc))>; +def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPSrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), + (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; // Shuffle and unpack instructions let Constraints = "$src1 = $dst" in { @@ -1725,10 +1721,10 @@ let Constraints = "$src1 = $dst" in { [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1, (memop addr:$src), imm:$cc))]>; } -def : Pat<(v2i64 (vsetcc (v2f64 VR128:$src1), VR128:$src2, cond:$cc)), - (CMPPDrri VR128:$src1, VR128:$src2, (SSE_CC_imm cond:$cc))>; -def : Pat<(v2i64 (vsetcc (v2f64 VR128:$src1), (memop addr:$src2), cond:$cc)), - (CMPPDrmi VR128:$src1, addr:$src2, (SSE_CC_imm cond:$cc))>; +def : Pat<(v2i64 (X86cmppd VR128:$src1, VR128:$src2, imm:$cc)), + (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmppd VR128:$src1, (memop addr:$src2), imm:$cc)), + (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; // Shuffle and unpack instructions let Constraints = "$src1 = $dst" in { @@ -1994,30 +1990,30 @@ defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>; defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>; defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>; -def : Pat<(v16i8 (vsetcc (v16i8 VR128:$src1), VR128:$src2, SETEQ)), +def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)), (PCMPEQBrr VR128:$src1, VR128:$src2)>; -def : Pat<(v16i8 (vsetcc (v16i8 VR128:$src1), (memop addr:$src2), SETEQ)), +def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))), 
(PCMPEQBrm VR128:$src1, addr:$src2)>; -def : Pat<(v8i16 (vsetcc (v8i16 VR128:$src1), VR128:$src2, SETEQ)), +def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)), (PCMPEQWrr VR128:$src1, VR128:$src2)>; -def : Pat<(v8i16 (vsetcc (v8i16 VR128:$src1), (memop addr:$src2), SETEQ)), +def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))), (PCMPEQWrm VR128:$src1, addr:$src2)>; -def : Pat<(v4i32 (vsetcc (v4i32 VR128:$src1), VR128:$src2, SETEQ)), +def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)), (PCMPEQDrr VR128:$src1, VR128:$src2)>; -def : Pat<(v4i32 (vsetcc (v4i32 VR128:$src1), (memop addr:$src2), SETEQ)), +def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))), (PCMPEQDrm VR128:$src1, addr:$src2)>; -def : Pat<(v16i8 (vsetcc (v16i8 VR128:$src1), VR128:$src2, SETGT)), +def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)), (PCMPGTBrr VR128:$src1, VR128:$src2)>; -def : Pat<(v16i8 (vsetcc (v16i8 VR128:$src1), (memop addr:$src2), SETGT)), +def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))), (PCMPGTBrm VR128:$src1, addr:$src2)>; -def : Pat<(v8i16 (vsetcc (v8i16 VR128:$src1), VR128:$src2, SETGT)), +def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)), (PCMPGTWrr VR128:$src1, VR128:$src2)>; -def : Pat<(v8i16 (vsetcc (v8i16 VR128:$src1), (memop addr:$src2), SETGT)), +def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))), (PCMPGTWrm VR128:$src1, addr:$src2)>; -def : Pat<(v4i32 (vsetcc (v4i32 VR128:$src1), VR128:$src2, SETGT)), +def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)), (PCMPGTDrr VR128:$src1, VR128:$src2)>; -def : Pat<(v4i32 (vsetcc (v4i32 VR128:$src1), (memop addr:$src2), SETGT)), +def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), (PCMPGTDrm VR128:$src1, addr:$src2)>; @@ -3258,6 +3254,11 @@ defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud", defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw", int_x86_sse41_pmaxuw, 1>; +def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), + (PCMPEQQrr VR128:$src1, 
VR128:$src2)>; +def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), + (PCMPEQQrm VR128:$src1, addr:$src2)>; + /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator let Constraints = "$src1 = $dst" in { @@ -3555,3 +3556,30 @@ def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2), def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>; + +/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator let Constraints = "$src1 = $dst" in { + multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Commutable = 0> { + def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize { + let isCommutable = Commutable; + } + def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; + } +} + +defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>; + +def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), + (PCMPGTQrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), + (PCMPGTQrm VR128:$src1, addr:$src2)>;