AVX-512: optimized a shuffle pattern to lower to VINSERTI64x4.

Added intrinsics for VPERMT2PS/PD/D/Q instructions.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207513 91177308-0d34-0410-b5e6-96231b3b80d8
Elena Demikhovsky 2014-04-29 09:09:15 +00:00
parent c5e41aed09
commit e3e08acd09
5 changed files with 93 additions and 1 deletion
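
For context, the kind of shuffle this commit optimizes looks like the following (a minimal LLVM IR sketch with an invented function name, using a v8i64 variant of the new tests below): the mask takes the low 256-bit half of each source, so the result can now be built with a single vinserti64x4 instead of a generic two-source permute.

; The mask picks elements 0..3 (low half of %x) and 8..11 (low half of %y),
; so the result is "low half of %x, low half of %y" -- vinserti64x4 $1.
define <8 x i64> @insert64x4_sketch(<8 x i64> %x, <8 x i64> %y) {
  %res = shufflevector <8 x i64> %x, <8 x i64> %y,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i64> %res
}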

include/llvm/IR/IntrinsicsX86.td

@@ -1129,6 +1129,27 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
        GCCBuiltin<"__builtin_ia32_vperm2f128_si256">,
        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
                   llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;

  def int_x86_avx512_mask_vpermt_d_512:
        GCCBuiltin<"__builtin_ia32_vpermt2vard512_mask">,
        Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
                   llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
  def int_x86_avx512_mask_vpermt_q_512:
        GCCBuiltin<"__builtin_ia32_vpermt2varq512_mask">,
        Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
                   llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
  def int_x86_avx512_mask_vpermt_ps_512:
        GCCBuiltin<"__builtin_ia32_vpermt2varps512_mask">,
        Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty,
                   llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
  def int_x86_avx512_mask_vpermt_pd_512:
        GCCBuiltin<"__builtin_ia32_vpermt2varpd512_mask">,
        Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty,
                   llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
}
// Vector blend
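
Note the operand order these definitions fix: the index vector comes first, then the two data sources, then the lane mask. An unmasked call passes an all-ones mask so that every result lane is written. A minimal IR sketch for the pd variant (hypothetical function name, assuming this revision's intrinsic names):

declare <8 x double> @llvm.x86.avx512.mask.vpermt.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)

define <8 x double> @vpermt2pd_sketch(<8 x double> %a, <8 x double> %b, <8 x i64> %perm) {
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermt.pd.512(<8 x i64> %perm, <8 x double> %a, <8 x double> %b, i8 -1)
  ret <8 x double> %res
}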

lib/Target/X86/X86ISelLowering.cpp

@@ -4172,6 +4172,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
  return true;
}

// Match shuffle masks suitable for the INSERTI64x4/INSERTF64x4 instructions,
// which produce either (src0[0], src1[0]) or (src1[0], src0[1]), where
// srcN[i] denotes the i-th 256-bit half of srcN.
static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
  if (!VT.is512BitVector())
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfSize = NumElts/2;
  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
      *Imm = 1;
      return true;
    }
  }
  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
      *Imm = 0;
      return true;
    }
  }
  return false;
}
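
To make the second case concrete (a worked example, not part of the commit): for v8i64, NumElts is 8 and HalfSize is 4, so the following mask takes the low half of the second source and the high half of the first, and the matcher succeeds with *Imm set to 0:

  ; Elements 8..11 are the low half of %y (sequential from NumElts);
  ; elements 4..7 are the high half of %x (sequential from HalfSize).
  %res = shufflevector <8 x i64> %x, <8 x i64> %y,
         <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>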
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
@@ -7755,6 +7778,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                getShuffleSHUFImmediate(SVOp), DAG);
  }

  unsigned Idx;
  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
                              Idx*(NumElems/2), DAG, dl);

  // Handle VPERM2F128/VPERM2I128 permutations
  if (isVPERM2X128Mask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
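
In words: when the mask matches, the lowering extracts the low 256 bits of V2 and inserts them into V1 at element offset Idx*(NumElems/2), i.e. into the upper half for Idx == 1 and the lower half for Idx == 0. The new shuffle tests below (test28/test29) pin down the resulting single instruction with FileCheck lines of the form:

  ; CHECK: vinserti64x4 $1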

lib/Target/X86/X86InstrAVX512.td

@@ -621,6 +621,22 @@ defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem,
                 X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem,
                 X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;

def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx),
                   (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))),
          (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>;
def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx),
                   (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))),
          (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>;
def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx),
                  (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))),
          (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>;
def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx),
                  (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))),
          (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
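
One detail worth calling out in the patterns above: the intrinsic takes the index vector as its first argument, while VPERMT2* keeps the first data source in the destination register and the indices in the second operand, so each Pat swaps $idx and $src1. A minimal end-to-end IR sketch for the q variant (a hypothetical test mirroring test_vpermt2ps below):

define <8 x i64> @vpermt2q_sketch(<8 x i64> %x, <8 x i64> %y, <8 x i64> %perm) {
  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermt.q.512(<8 x i64> %perm, <8 x i64> %x, <8 x i64> %y, i8 -1)
  ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.mask.vpermt.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)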

test/CodeGen/X86/avx512-intrinsics.ll

@@ -536,4 +536,12 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
ret void
}
declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )

define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%perm) {
; CHECK: vpermt2ps {{.*}}encoding: [0x62,0xf2,0x6d,0x48,0x7f,0xc1]
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>%perm, <16 x float>%x, <16 x float>%y, i16 -1)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)

test/CodeGen/X86/avx512-shuffle.ll

@@ -231,3 +231,22 @@ define <16 x i32> @test27(<4 x i32>%a) {
  %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i32> %res
}

; CHECK-LABEL: @test28
; CHECK: vinserti64x4 $1
; CHECK: ret
define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
  %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                               i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i32> %res
}

; CHECK-LABEL: @test29
; CHECK: vinserti64x4 $0
; CHECK: ret
define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
  %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
                                                               i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i32> %res
}