diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index e262687e912..145da599a14 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1129,6 +1129,27 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_vperm2f128_si256">,
         Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
                   llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_vpermt_d_512:
+        GCCBuiltin<"__builtin_ia32_vpermt2vard512_mask">,
+        Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                  llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_vpermt_q_512:
+        GCCBuiltin<"__builtin_ia32_vpermt2varq512_mask">,
+        Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                  llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_vpermt_ps_512:
+        GCCBuiltin<"__builtin_ia32_vpermt2varps512_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty,
+                  llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_vpermt_pd_512:
+        GCCBuiltin<"__builtin_ia32_vpermt2varpd512_mask">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty,
+                  llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
+
 }
 
 // Vector blend
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 35474d70d9c..fb941b8d183 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4172,6 +4172,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   return true;
 }
 
+// Match for INSERTI64x4/INSERTF64x4 instructions: (src0[0], src1[0]) or
+// (src1[0], src0[1]), i.e. a manipulation of the 256-bit sub-vectors.
+static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
+  if (!VT.is512BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfSize = NumElts/2;
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
+      *Imm = 1;
+      return true;
+    }
+  }
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
+      *Imm = 0;
+      return true;
+    }
+  }
+  return false;
+}
+
 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSS,
 /// MOVSD, and MOVD, i.e. setting the lowest element.
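As a reading aid for the matcher added above, here is a minimal standalone C++ sketch of the two mask shapes isINSERT64x4Mask accepts for a v16i32 shuffle and the VINSERTI64x4 immediate each one maps to. This is not the in-tree helper: matchesInsert64x4, seqFrom, and the sample masks are made up for illustration, and undef elements are modelled as -1, as in LLVM shuffle masks.

#include <array>
#include <cstdio>

// The result is built from two whole 256-bit halves: either (src0 low, src1 low)
// with immediate 1, or (src1 low, src0 high) with immediate 0.
static bool matchesInsert64x4(const std::array<int, 16> &Mask, unsigned &Imm) {
  // seqFrom: elements [Pos, Pos+8) count up from Low; -1 (undef) always matches.
  auto seqFrom = [&](unsigned Pos, int Low) {
    for (unsigned i = 0; i != 8; ++i)
      if (Mask[Pos + i] >= 0 && Mask[Pos + i] != Low + (int)i)
        return false;
    return true;
  };
  if (seqFrom(0, 0) && seqFrom(8, 16)) { Imm = 1; return true; }  // low half of src1 goes high
  if (seqFrom(0, 16) && seqFrom(8, 8)) { Imm = 0; return true; }  // low half of src1 goes low
  return false;
}

int main() {
  std::array<int, 16> Hi = {0, 1, 2, 3, 4, 5, 6, 7,
                            16, 17, 18, 19, 20, 21, 22, 23};
  std::array<int, 16> Lo = {16, 17, 18, 19, 20, 21, 22, 23,
                            8, 9, 10, 11, 12, 13, 14, 15};
  unsigned Imm;
  if (matchesInsert64x4(Hi, Imm))
    std::printf("Hi matches, imm = %u\n", Imm);  // prints imm = 1
  if (matchesInsert64x4(Lo, Imm))
    std::printf("Lo matches, imm = %u\n", Imm);  // prints imm = 0
  return 0;
}

The two sample masks correspond to the shufflevector tests added to avx512-shuffle.ll further down in this patch.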
@@ -7755,6 +7778,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 getShuffleSHUFImmediate(SVOp), DAG);
   }
 
+  unsigned Idx;
+  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
+    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
+                              Idx*(NumElems/2), DAG, dl);
+
   // Handle VPERM2F128/VPERM2I128 permutations
   if (isVPERM2X128Mask(M, VT, HasFp256))
     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index d7f05bc9e34..72f5aff569b 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -621,6 +621,22 @@ defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512me
                  X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem,
                  X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx),
+                  (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))),
+          (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx),
+                  (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))),
+          (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx),
+                 (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))),
+          (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx),
+                 (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))),
+          (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>;
 //===----------------------------------------------------------------------===//
 // AVX-512 - BLEND using mask
 //
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 3fb38edee5c..2c57944861c 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -536,4 +536,12 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
   ret void
 }
 
-declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )
\ No newline at end of file
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )
+
+define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%perm) {
+; CHECK: vpermt2ps {{.*}}encoding: [0x62,0xf2,0x6d,0x48,0x7f,0xc1]
+  %res = call <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>%perm, <16 x float>%x, <16 x float>%y, i16 -1)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 59d7010c8d1..23ddc3a6c1d 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -231,3 +231,22 @@ define <16 x i32> @test27(<4 x i32>%a) {
   %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32>
   ret <16 x i32> %res
 }
+
+; CHECK-LABEL: @test28
+; CHECK: vinserti64x4 $1
+; CHECK: ret
+define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
+  %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i32> %res
+}
+
+; CHECK-LABEL: @test29
+; CHECK: vinserti64x4 $0
+; CHECK: ret
+define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
+  %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i32> %res
+}
+
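For reference, the unmasked (-1 mask) operation that the new llvm.x86.avx512.mask.vpermt.* intrinsics expose can be modelled with a short scalar sketch. This assumes the usual VPERMT2D behaviour for 16 dword elements (the low four index bits select an element, bit 4 selects between the two source tables); vpermt2d_ref and the sample data below are illustrative names only, not part of the patch.

#include <array>
#include <cstdint>
#include <cstdio>

// dst[i] = bit 4 of idx[i] clear ? a[idx[i] & 15] : b[idx[i] & 15]
static std::array<uint32_t, 16>
vpermt2d_ref(const std::array<uint32_t, 16> &idx,
             const std::array<uint32_t, 16> &a,    // first table (the overwritten register)
             const std::array<uint32_t, 16> &b) {  // second table
  std::array<uint32_t, 16> dst;
  for (unsigned i = 0; i != 16; ++i) {
    unsigned sel = idx[i] & 31;                    // only the low 5 index bits matter
    dst[i] = sel < 16 ? a[sel] : b[sel - 16];      // bit 4 picks the table
  }
  return dst;
}

int main() {
  std::array<uint32_t, 16> a, b, idx;
  for (unsigned i = 0; i != 16; ++i) {
    a[i] = i;                       // first table:  0..15
    b[i] = 100 + i;                 // second table: 100..115
    idx[i] = (i % 2) ? i + 16 : i;  // even lanes from a, odd lanes from b
  }
  std::array<uint32_t, 16> r = vpermt2d_ref(idx, a, b);
  for (uint32_t v : r)
    std::printf("%u ", v);          // 0 101 2 103 ... 14 115
  std::printf("\n");
  return 0;
}

In terms of the patterns above, the first table plays the role of the $src1 operand that VPERMT2Drr overwrites, the index vector is $idx, and the second table is $src2.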