From 4d36bd80e68b8245ba4fcf26d33dbf35da3e2002 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Tue, 13 Aug 2013 13:24:07 +0000
Subject: [PATCH] AVX-512: Added CMP and BLEND instructions. Lowering for SETCC.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188265 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp      |  84 ++++++++--
 lib/Target/X86/X86ISelLowering.h        |   7 +
 lib/Target/X86/X86InstrAVX512.td        | 202 ++++++++++++++++++++++++
 lib/Target/X86/X86InstrFragmentsSIMD.td |   9 ++
 test/CodeGen/X86/avx512-vec-cmp.ll      | 113 +++++++++++++
 5 files changed, 406 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/X86/avx512-vec-cmp.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 55b1f012252..b6e7413fafd 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9705,6 +9705,42 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
 }
 
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+  SDValue Cond; // NOTE(review): never read in this function
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getValueType().getSimpleVT();
+
+  EVT OpVT = Op0.getValueType();
+  assert(OpVT.getVectorElementType().getSizeInBits() >= 32 &&
+         Op.getValueType().getScalarType() == MVT::i1 &&
+         "Cannot set masked compare for this operation");
+
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  SDLoc dl(Op);
+
+  bool Unsigned = false;
+  unsigned SSECC; // immediate predicate passed to CMPM/CMPMU
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  SSECC = 4; break;
+  case ISD::SETEQ:  SSECC = 0; break;
+  case ISD::SETUGT: Unsigned = true; // fall through to the signed encoding
+  case ISD::SETGT:  SSECC = 6; break; // NLE
+  case ISD::SETULT: Unsigned = true; // fall through to the signed encoding
+  case ISD::SETLT:  SSECC = 1; break;
+  case ISD::SETUGE: Unsigned = true; // fall through to the signed encoding
+  case ISD::SETGE:  SSECC = 5; break; // NLT
+  case ISD::SETULE: Unsigned = true; // fall through to the signed encoding
+  case ISD::SETLE:  SSECC = 2; break;
+  }
+  unsigned Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; // unsigned predicates use CMPMU
+  return DAG.getNode(Opc, dl, VT, Op0, Op1,
+                     DAG.getConstant(SSECC, MVT::i8));
+
+}
+
 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                            SelectionDAG &DAG) {
   SDValue Cond;
@@ -9723,7 +9759,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
 #endif
 
     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
-
+    unsigned Opc = X86ISD::CMPP;
+    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+      assert(VT.getVectorNumElements() <= 16 &&
+             "Mask vector has too many elements");
+      Opc = X86ISD::CMPM; // FP compare whose result is an i1 mask vector
+    }
     // In the two special cases we can't handle, emit two comparisons.
     if (SSECC == 8) {
       unsigned CC0, CC1;
@@ -9735,14 +9776,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
       }
 
-      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                  DAG.getConstant(CC0, MVT::i8));
-      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                  DAG.getConstant(CC1, MVT::i8));
       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
     }
     // Handle all other FP comparisons here.
-    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                        DAG.getConstant(SSECC, MVT::i8));
   }
 
@@ -9750,6 +9791,24 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   if (VT.is256BitVector() && !Subtarget->hasInt256())
     return Lower256IntVSETCC(Op, DAG);
 
+  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
+  EVT OpVT = Op1.getValueType();
+  if (Subtarget->hasAVX512()) {
+    if (Op1.getValueType().is512BitVector() ||
+        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
+      return LowerIntVSETCC_AVX512(Op, DAG);
+
+    // In AVX-512 architecture setcc returns mask with i1 elements,
+    // But there is no compare instruction for i8 and i16 elements.
+    // We are not talking about 512-bit operands in this case, these
+    // types are illegal.
+    if (MaskResult &&
+        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
+         OpVT.getVectorElementType().getSizeInBits() >= 8))
+      return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+  }
+
   // We are handling one of the integer comparisons here. Since SSE only has
   // GT and EQ comparisons for integer, swapping operands and multiple
   // operations may be required for some comparisons.
@@ -9759,15 +9818,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   switch (SetCCOpcode) {
   default: llvm_unreachable("Unexpected SETCC condition");
   case ISD::SETNE:  Invert = true;
-  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
+  case ISD::SETEQ:  Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
   case ISD::SETLT:  Swap = true;
-  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
+  case ISD::SETGT:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
   case ISD::SETGE:  Swap = true;
-  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
+  case ISD::SETLE:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    Invert = true; break;
   case ISD::SETULT: Swap = true;
-  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
+  case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    FlipSigns = true; break;
   case ISD::SETUGE: Swap = true;
-  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
+  case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    FlipSigns = true; Invert = true; break;
   }
 
   // Special case: Use min/max operations for SETULE/SETUGE
@@ -13201,6 +13263,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::CMP:                return "X86ISD::CMP";
   case X86ISD::COMI:               return "X86ISD::COMI";
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::CMPM:               return "X86ISD::CMPM";
+  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
@@ -13273,6 +13337,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::CMPP:               return "X86ISD::CMPP";
   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
+  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
+  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
   case X86ISD::ADD:                return "X86ISD::ADD";
   case X86ISD::SUB:                return "X86ISD::SUB";
   case X86ISD::ADC:                return "X86ISD::ADC";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index c931b9b6667..7201f7a8fee 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -274,6 +274,13 @@ namespace llvm {
 
       // PCMP* - Vector integer comparisons.
       PCMPEQ, PCMPGT,
+      // PCMP*M - Vector integer comparisons, the result is in a mask vector
+      PCMPEQM, PCMPGTM,
+
+      /// CMPM, CMPMU - Vector comparison generating mask bits for fp and
+      /// integer signed and unsigned data types.
+      CMPM,
+      CMPMU,
 
       // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
       ADD, SUB, ADC, SBB, SMUL,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 7fed783b35f..f4528a9b3e0 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -564,7 +564,195 @@ defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512me
 
 defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem,
                                v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr,
+                          RegisterClass KRC, RegisterClass RC,
+                          X86MemOperand x86memop, PatFrag mem_frag,
+                          SDNode OpNode, ValueType vt> {
+  def rr : AVX5128I, EVEX_4V, EVEX_K; // NOTE(review): template args appear to be missing on rr/rm - restore from upstream
+  def rm : AVX5128I,
+               EVEX_4V, EVEX_K;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", VK16WM, VR512, f512mem,
+                              memopv16f32, vselect, v16f32>,
+                              EVEX_CD8<32, CD8VF>, EVEX_V512;
+let ExeDomain = SSEPackedDouble in
+defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", VK8WM, VR512, f512mem,
+                              memopv8f64, vselect, v8f64>,
+                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", VK16WM, VR512, f512mem,
+                              memopv8i64, vselect, v16i32>,
+                              EVEX_CD8<32, CD8VF>, EVEX_V512;
+
+defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", VK8WM, VR512, f512mem,
+                              memopv8i64, vselect, v8i64>, VEX_W,
+                              EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+                            (v8f32 VR256X:$src2))),
+            (EXTRACT_SUBREG
+              (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+                            (v8i32 VR256X:$src2))),
+            (EXTRACT_SUBREG
+                (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
+              SDNode OpNode, ValueType vt> {
+  def rr : AVX512BI, EVEX_4V; // NOTE(review): template args appear to be missing on rr/rm - restore from upstream
+  def rm : AVX512BI, EVEX_4V;
+}
+
+defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem,
+                           memopv8i64, X86pcmpeqm, v16i32>, EVEX_V512;
+defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem,
+                           memopv8i64, X86pcmpeqm, v8i64>, T8, EVEX_V512, VEX_W;
+
+defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem,
+                           memopv8i64, X86pcmpgtm, v16i32>, EVEX_V512;
+defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem,
+                           memopv8i64, X86pcmpgtm, v8i64>, T8, EVEX_V512, VEX_W;
+
+def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPGTDZrr
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPEQDZrr
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+
+multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
+              SDNode OpNode, ValueType vt, Operand CC, string asm,
+              string asm_alt> {
+  def rri : AVX512AIi8, EVEX_4V; // NOTE(review): template args appear to be missing on all four defs - restore from upstream
+  def rmi : AVX512AIi8, EVEX_4V;
+  // Accept explicit immediate argument form instead of comparison code.
+  let neverHasSideEffects = 1 in {
+    def rri_alt : AVX512AIi8, EVEX_4V;
+    def rmi_alt : AVX512AIi8, EVEX_4V;
+  }
+}
+
+defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv8i64,
+                              X86cmpm, v16i32, AVXCC,
+              "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv8i64,
+                               X86cmpmu, v16i32, AVXCC,
+              "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64,
+                              X86cmpm, v8i64, AVXCC,
+              "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64,
+                               X86cmpmu, v8i64, AVXCC,
+              "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// avx512_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass avx512_cmp_packed { // NOTE(review): parameter list appears to be missing - restore from upstream
+  def rri : AVX512PIi8<0xC2, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>;
+  def rmi : AVX512PIi8<0xC2, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set KRC:$dst,
+              (OpNode (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
+
+  // Accept explicit immediate argument form instead of comparison code; the result is a mask (KRC) as in rri/rmi.
+  let neverHasSideEffects = 1 in {
+    def rri_alt : PIi8<0xC2, MRMSrcReg,
+               (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_CMPP_RR, d>;
+    def rmi_alt : PIi8<0xC2, MRMSrcMem,
+               (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_CMPP_RM, d>;
+  }
+}
+
+defm VCMPPSZ : avx512_cmp_packed, TB, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VF>; // NOTE(review): template args appear to be missing here and on VCMPPDZ
+defm VCMPPDZ : avx512_cmp_packed, TB, OpSize, EVEX_4V, VEX_W, EVEX_V512,
+                 EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VCMPPSZrri
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPDZrri
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPUDZrri
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+
 // Mask register copy, including
 // - copy between mask registers
 // - load/store mask registers
@@ -949,4 +1137,18 @@ defm VMOVDQU32 : avx512_mov_int<0x6F, "vmovdqu32", VR512, VK16WM, memopv16i32, i
 defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, memopv8i64, i512mem>,
                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 
+let AddedComplexity = 20 in {
+def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1),
+                           (v16f32 VR512:$src2))),
+                  (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1),
+                           (v8f64 VR512:$src2))),
+                  (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1),
+                           (v16i32 VR512:$src2))),
+                  (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1),
+                           (v8i64 VR512:$src2))),
+                  (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+}
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 8587d382b2b..3d6370fdac6 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -118,6 +118,15 @@ def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
 def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
 def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
 
+def X86IntCmpMask : SDTypeProfile<1, 2,
+    [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>;
+def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
+def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
+
+def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
+
 def X86vshl : SDNode<"X86ISD::VSHL",
                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                       SDTCisVec<2>]>>;
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
new file mode 100644
index 00000000000..ee57af731f6
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -0,0 +1,113 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vcmpleps
+; CHECK: vmovups
+; CHECK: ret
+define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
+  %mask = fcmp ole <16 x float> %x, %y
+  %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
+  ret <16 x float> %max
+}
+
+; CHECK-LABEL: test2
+; CHECK: vcmplepd
+; CHECK: vmovupd
+; CHECK: ret
+define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
+  %mask = fcmp ole <8 x double> %x, %y
+  %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
+  ret <8 x double> %max
+}
+
+; CHECK-LABEL: test3
+; CHECK: vpcmpeqd
+; CHECK: vmovdqu32
+; CHECK: ret
+define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %y) nounwind {
+  %mask = icmp eq <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  ret <16 x i32> %max
+}
+
+; CHECK-LABEL: @test4_unsigned
+; CHECK: vpcmpnltud
+; CHECK: vmovdqu32
+; CHECK: ret
+define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
+  %mask = icmp uge <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  ret <16 x i32> %max
+}
+
+; CHECK-LABEL: test5
+; CHECK: vpcmpeqq {{.*}}%k1
+; CHECK: vmovdqu64 {{.*}}%k1
+; CHECK: ret
+define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
+  %mask = icmp eq <8 x i64> %x, %y
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+  ret <8 x i64> %max
+}
+
+; CHECK-LABEL: test6_unsigned
+; CHECK: vpcmpnleuq {{.*}}%k1
+; CHECK: vmovdqu64 {{.*}}%k1
+; CHECK: ret
+define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
+  %mask = icmp ugt <8 x i64> %x, %y
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+  ret <8 x i64> %max
+}
+
+; CHECK-LABEL: test7
+; CHECK: xor
+; CHECK: vcmpltps
+; CHECK: vblendvps
+; CHECK: ret
+define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
+  %mask = fcmp olt <4 x float> %a, zeroinitializer
+  %c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
+  ret <4 x float>%c
+}
+
+; CHECK-LABEL: test8
+; CHECK: xor
+; CHECK: vcmpltpd
+; CHECK: vblendvpd
+; CHECK: ret
+define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
+  %mask = fcmp olt <2 x double> %a, zeroinitializer
+  %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
+  ret <2 x double>%c
+}
+
+; CHECK-LABEL: test9
+; CHECK: vpcmpeqd
+; CHECK: vpblendmd
+; CHECK: ret
+define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
+  %mask = icmp eq <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test10
+; CHECK: vcmpeqps
+; CHECK: vblendmps
+; CHECK: ret
+define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
+  %mask = fcmp oeq <8 x float> %x, %y
+  %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %max
+}
+
+; CHECK-LABEL: test11_unsigned
+; CHECK: vpcmpnleud %zmm
+; CHECK: vpblendmd %zmm
+; CHECK: ret
+define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
+  %mask = icmp ugt <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %max
+}