AVX-512: optimized a shuffle pattern to lower to VINSERTI64x4.

Added intrinsics for VPERMT2PS/PD/D/Q instructions.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207513 91177308-0d34-0410-b5e6-96231b3b80d8
Elena Demikhovsky 2014-04-29 09:09:15 +00:00
parent c5e41aed09
commit e3e08acd09
5 changed files with 93 additions and 1 deletion
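
For context, the kind of shuffle this commit optimizes looks like the following (a minimal LLVM IR sketch with an invented function name, using a v8i64 variant of the new tests below): the mask takes the low 256-bit half of each source, so the result can now be built with a single vinserti64x4 instead of a generic two-source permute.

; The mask picks elements 0..3 (low half of %x) and 8..11 (low half of %y),
; so the result is "low half of %x, low half of %y" -- vinserti64x4 $1.
define <8 x i64> @insert64x4_sketch(<8 x i64> %x, <8 x i64> %y) {
  %res = shufflevector <8 x i64> %x, <8 x i64> %y,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i64> %res
}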

include/llvm/IR/IntrinsicsX86.td

@@ -1129,6 +1129,27 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
        GCCBuiltin<"__builtin_ia32_vperm2f128_si256">,
        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
                   llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;

  def int_x86_avx512_mask_vpermt_d_512:
        GCCBuiltin<"__builtin_ia32_vpermt2vard512_mask">,
        Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
                   llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
  def int_x86_avx512_mask_vpermt_q_512:
        GCCBuiltin<"__builtin_ia32_vpermt2varq512_mask">,
        Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
                   llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
  def int_x86_avx512_mask_vpermt_ps_512:
        GCCBuiltin<"__builtin_ia32_vpermt2varps512_mask">,
        Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty,
                   llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
  def int_x86_avx512_mask_vpermt_pd_512:
        GCCBuiltin<"__builtin_ia32_vpermt2varpd512_mask">,
        Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty,
                   llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
}
// Vector blend
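
Note the operand order these definitions fix: the index vector comes first, then the two data sources, then the lane mask. An unmasked call passes an all-ones mask so that every result lane is written. A minimal IR sketch for the pd variant (hypothetical function name, assuming this revision's intrinsic names):

declare <8 x double> @llvm.x86.avx512.mask.vpermt.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)

define <8 x double> @vpermt2pd_sketch(<8 x double> %a, <8 x double> %b, <8 x i64> %perm) {
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermt.pd.512(<8 x i64> %perm, <8 x double> %a, <8 x double> %b, i8 -1)
  ret <8 x double> %res
}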

lib/Target/X86/X86ISelLowering.cpp

@@ -4172,6 +4172,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
  return true;
}

// Match shuffle masks suitable for the INSERTI64x4/INSERTF64x4 instructions,
// which produce either (src0[0], src1[0]) or (src1[0], src0[1]), where
// srcN[i] denotes the i-th 256-bit half of srcN.
static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
  if (!VT.is512BitVector())
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfSize = NumElts/2;
  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
      *Imm = 1;
      return true;
    }
  }
  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
      *Imm = 0;
      return true;
    }
  }
  return false;
}
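
To make the second case concrete (a worked example, not part of the commit): for v8i64, NumElts is 8 and HalfSize is 4, so the following mask takes the low half of the second source and the high half of the first, and the matcher succeeds with *Imm set to 0:

  ; Elements 8..11 are the low half of %y (sequential from NumElts);
  ; elements 4..7 are the high half of %x (sequential from HalfSize).
  %res = shufflevector <8 x i64> %x, <8 x i64> %y,
         <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>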
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
@@ -7755,6 +7778,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                getShuffleSHUFImmediate(SVOp), DAG);
  }

  unsigned Idx;
  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
                              Idx*(NumElems/2), DAG, dl);

  // Handle VPERM2F128/VPERM2I128 permutations
  if (isVPERM2X128Mask(M, VT, HasFp256))
    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
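
In words: when the mask matches, the lowering extracts the low 256 bits of V2 and inserts them into V1 at element offset Idx*(NumElems/2), i.e. into the upper half for Idx == 1 and the lower half for Idx == 0. The new shuffle tests below (test28/test29) pin down the resulting single instruction with FileCheck lines of the form:

  ; CHECK: vinserti64x4 $1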

lib/Target/X86/X86InstrAVX512.td

@@ -621,6 +621,22 @@ defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem,
                 X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem,
                 X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;

def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx),
                   (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))),
          (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>;
def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx),
                   (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))),
          (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>;
def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx),
                  (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))),
          (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>;
def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx),
                  (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))),
          (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
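
One detail worth calling out in the patterns above: the intrinsic takes the index vector as its first argument, while VPERMT2* keeps the first data source in the destination register and the indices in the second operand, so each Pat swaps $idx and $src1. A minimal end-to-end IR sketch for the q variant (a hypothetical test mirroring test_vpermt2ps below):

define <8 x i64> @vpermt2q_sketch(<8 x i64> %x, <8 x i64> %y, <8 x i64> %perm) {
  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermt.q.512(<8 x i64> %perm, <8 x i64> %x, <8 x i64> %y, i8 -1)
  ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.mask.vpermt.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)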

test/CodeGen/X86/avx512-intrinsics.ll

@@ -536,4 +536,12 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
ret void
}
declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )

define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%perm) {
; CHECK: vpermt2ps {{.*}}encoding: [0x62,0xf2,0x6d,0x48,0x7f,0xc1]
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>%perm, <16 x float>%x, <16 x float>%y, i16 -1)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)

test/CodeGen/X86/avx512-shuffle.ll

@@ -231,3 +231,22 @@ define <16 x i32> @test27(<4 x i32>%a) {
  %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i32> %res
}

; CHECK-LABEL: @test28
; CHECK: vinserti64x4 $1
; CHECK: ret
define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
  %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                               i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i32> %res
}

; CHECK-LABEL: @test29
; CHECK: vinserti64x4 $0
; CHECK: ret
define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
  %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
                                                               i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i32> %res
}