diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td index acccb149596..04a119f055f 100644 --- a/include/llvm/IntrinsicsX86.td +++ b/include/llvm/IntrinsicsX86.td @@ -1398,6 +1398,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pmul_dq : GCCBuiltin<"__builtin_ia32_pmuldq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty, + llvm_v8i32_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; @@ -1407,21 +1410,49 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty, + llvm_v32i8_ty], [IntrNoMem, Commutative]>; +} + +// Vector min, max +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pmaxu_b : GCCBuiltin<"__builtin_ia32_pmaxub256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pmaxu_w : GCCBuiltin<"__builtin_ia32_pmaxuw256">, + Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, + llvm_v16i16_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pmaxu_d : GCCBuiltin<"__builtin_ia32_pmaxud256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, + llvm_v8i32_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pmaxs_b : GCCBuiltin<"__builtin_ia32_pmaxsb256">, + Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, + llvm_v32i8_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pmaxs_w : GCCBuiltin<"__builtin_ia32_pmaxsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pmaxs_d : GCCBuiltin<"__builtin_ia32_pmaxsd256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, + llvm_v8i32_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pminu_b : GCCBuiltin<"__builtin_ia32_pminub256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pminu_w : GCCBuiltin<"__builtin_ia32_pminuw256">, + Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, + llvm_v16i16_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pminu_d : GCCBuiltin<"__builtin_ia32_pminud256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, + llvm_v8i32_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pmins_b : GCCBuiltin<"__builtin_ia32_pminsb256">, + Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, + llvm_v32i8_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pmins_w : GCCBuiltin<"__builtin_ia32_pminsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">, - Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pmins_d : GCCBuiltin<"__builtin_ia32_pminsd256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, + llvm_v8i32_ty], [IntrNoMem, Commutative]>; } // Integer shift ops. @@ -1501,15 +1532,21 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pcmpeq_d : GCCBuiltin<"__builtin_ia32_pcmpeqd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_pcmpeq_q : GCCBuiltin<"__builtin_ia32_pcmpeqq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], + [IntrNoMem, Commutative]>; def int_x86_avx2_pcmpgt_b : GCCBuiltin<"__builtin_ia32_pcmpgtb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, - llvm_v32i8_ty], [IntrNoMem]>; + Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], + [IntrNoMem]>; def int_x86_avx2_pcmpgt_w : GCCBuiltin<"__builtin_ia32_pcmpgtw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, - llvm_v16i16_ty], [IntrNoMem]>; + Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], + [IntrNoMem]>; def int_x86_avx2_pcmpgt_d : GCCBuiltin<"__builtin_ia32_pcmpgtd256">, - Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, - llvm_v8i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_pcmpgt_q : GCCBuiltin<"__builtin_ia32_pcmpgtq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], + [IntrNoMem]>; } // Pack ops. @@ -1523,6 +1560,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_packuswb : GCCBuiltin<"__builtin_ia32_packuswb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; + def int_x86_avx2_packusdw : GCCBuiltin<"__builtin_ia32_packusdw256">, + Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, + llvm_v8i32_ty], [IntrNoMem]>; } // Absolute value ops @@ -1620,6 +1660,23 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". [IntrNoMem]>; } +// Vector blend +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". + def int_x86_avx2_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb256">, + Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, + llvm_v32i8_ty], [IntrNoMem]>; + def int_x86_avx2_pblendw : GCCBuiltin<"__builtin_ia32_pblendw256">, + Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, + llvm_i32_ty], [IntrNoMem]>; +} + +// Vector load with broadcast +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". + def int_x86_avx2_vbroadcasti128 : + GCCBuiltin<"__builtin_ia32_vbroadcastsi256">, + Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>; +} + // Misc. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">, @@ -1627,6 +1684,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; + def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">, + Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty, + llvm_i32_ty], [IntrNoMem, Commutative]>; + def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">, + Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f30a0c4699a..77a9031090d 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -6248,6 +6248,22 @@ multiclass SS41I_binop_rm_int opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } +/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator +multiclass SS41I_binop_rm_int_y opc, string OpcodeStr, + Intrinsic IntId256> { + let isCommutable = 1 in + def Yrr : SS48I, OpSize; + def Yrm : SS48I, OpSize; +} + let Predicates = [HasAVX] in { let isCommutable = 0 in defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, @@ -6279,6 +6295,32 @@ let Predicates = [HasAVX] in { (VPCMPEQQrm VR128:$src1, addr:$src2)>; } +let Predicates = [HasAVX2] in { + let isCommutable = 0 in + defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw", + int_x86_avx2_packusdw>, VEX_4V; + defm VPCMPEQQ : SS41I_binop_rm_int_y<0x29, "vpcmpeqq", + int_x86_avx2_pcmpeq_q>, VEX_4V; + defm VPMINSB : SS41I_binop_rm_int_y<0x38, "vpminsb", + int_x86_avx2_pmins_b>, VEX_4V; + defm VPMINSD : SS41I_binop_rm_int_y<0x39, "vpminsd", + int_x86_avx2_pmins_d>, VEX_4V; + defm VPMINUD : SS41I_binop_rm_int_y<0x3B, "vpminud", + int_x86_avx2_pminu_d>, VEX_4V; + defm VPMINUW : SS41I_binop_rm_int_y<0x3A, "vpminuw", + int_x86_avx2_pminu_w>, VEX_4V; + defm VPMAXSB : SS41I_binop_rm_int_y<0x3C, "vpmaxsb", + int_x86_avx2_pmaxs_b>, VEX_4V; + defm VPMAXSD : SS41I_binop_rm_int_y<0x3D, "vpmaxsd", + int_x86_avx2_pmaxs_d>, VEX_4V; + defm VPMAXUD : SS41I_binop_rm_int_y<0x3F, "vpmaxud", + int_x86_avx2_pmaxu_d>, VEX_4V; + defm VPMAXUW : SS41I_binop_rm_int_y<0x3E, "vpmaxuw", + int_x86_avx2_pmaxu_w>, VEX_4V; + defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", + int_x86_avx2_pmul_dq>, VEX_4V; +} + let Constraints = "$src1 = $dst" in { let isCommutable = 0 in defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; @@ -6301,7 +6343,7 @@ def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), /// SS48I_binop_rm - Simple SSE41 binary operator. multiclass SS48I_binop_rm opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, bit Is2Addr = 1> { + ValueType OpVT, bit Is2Addr = 1> { let isCommutable = 1 in def rr : SS48I opc, string OpcodeStr, SDNode OpNode, OpSize; } +/// SS48I_binop_rm - Simple SSE41 binary operator. +multiclass SS48I_binop_rm_y opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT> { + let isCommutable = 1 in + def Yrr : SS48I, + OpSize; + def Yrm : SS48I, + OpSize; +} + let Predicates = [HasAVX] in defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V; +let Predicates = [HasAVX2] in + defm VPMULLD : SS48I_binop_rm_y<0x40, "vpmulld", mul, v8i32>, VEX_4V; let Constraints = "$src1 = $dst" in defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>; @@ -6375,6 +6436,15 @@ let Predicates = [HasAVX] in { VR256, memopv32i8, i256mem, 0>, VEX_4V; } +let Predicates = [HasAVX2] in { + let isCommutable = 0 in { + defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, + VR256, memopv32i8, i256mem, 0>, VEX_4V; + defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, + VR256, memopv32i8, i256mem, 0>, VEX_4V; + } +} + let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, @@ -6393,7 +6463,6 @@ let Constraints = "$src1 = $dst" in { } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators -let Predicates = [HasAVX] in { multiclass SS41I_quaternary_int_avx opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId> { @@ -6413,8 +6482,8 @@ multiclass SS41I_quaternary_int_avx opc, string OpcodeStr, RC:$src3))], SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; } -} +let Predicates = [HasAVX] in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, memopv16i8, int_x86_sse41_blendvpd>; defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, @@ -6425,6 +6494,12 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, memopv32i8, int_x86_avx_blendv_pd_256>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, memopv32i8, int_x86_avx_blendv_ps_256>; +} + +let Predicates = [HasAVX2] in { +defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, + memopv32i8, int_x86_avx2_pblendvb>; +} let Predicates = [HasAVX] in { def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), @@ -6503,6 +6578,11 @@ def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, OpSize, VEX; +let Predicates = [HasAVX2] in +def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>, + OpSize, VEX; def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, @@ -6532,6 +6612,22 @@ multiclass SS42I_binop_rm_int opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } +/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator +multiclass SS42I_binop_rm_int_y opc, string OpcodeStr, + Intrinsic IntId256> { + def Yrr : SS428I, + OpSize; + def Yrm : SS428I, OpSize; +} + let Predicates = [HasAVX] in { defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq, 0>, VEX_4V; @@ -6542,6 +6638,11 @@ let Predicates = [HasAVX] in { (VPCMPGTQrm VR128:$src1, addr:$src2)>; } +let Predicates = [HasAVX2] in { + defm VPCMPGTQ : SS42I_binop_rm_int_y<0x37, "vpcmpgtq", int_x86_avx2_pcmpgt_q>, + VEX_4V; +} + let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>; @@ -6991,6 +7092,10 @@ def VBROADCASTSD : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256>; +let Predicates = [HasAVX2] in +def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, + int_x86_avx2_vbroadcasti128>; + def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index 2dc0f5ce2e5..8aef255cae5 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -512,6 +512,119 @@ define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) { declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone +define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) { + ; CHECK: movl + ; CHECK: vmovntdqa + %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly + + +define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { + ; CHECK: vmpsadbw + %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone + + +define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpackusdw + %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) { + ; CHECK: vpblendvb + %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { + ; CHECK: vpblendw + %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) { + ; CHECK: vpcmpeqq + %res = call <4 x i64> @llvm.x86.avx2.pcmpeq.q(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pcmpeq.q(<4 x i64>, <4 x i64>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) { + ; CHECK: vpmaxsb + %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpmaxsd + %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpmaxud + %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) { + ; CHECK: vpmaxuw + %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone + + +define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) { + ; CHECK: vpminsb + %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpminsd + %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpminud + %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) { + ; CHECK: vpminuw + %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone + + define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) { ; CHECK: vpmovsxbd %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1] @@ -606,3 +719,27 @@ define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) { ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pmul.dq(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpmuldq + %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<2 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) { + ; CHECK: vpcmpgtq + %res = call <4 x i64> @llvm.x86.avx2.pcmpgt.q(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pcmpgt.q(<4 x i64>, <4 x i64>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_vbroadcasti128(i8* %a0) { + ; CHECK: vbroadcasti128 + %res = call <4 x i64> @llvm.x86.avx2.vbroadcasti128(i8* %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.vbroadcasti128(i8*) nounwind readonly