From df5d431084c5f5adb3cfa41eb1cb852bbce0a61c Mon Sep 17 00:00:00 2001 From: Adam Nemet Date: Wed, 2 Jul 2014 21:25:54 +0000 Subject: [PATCH] [X86] AVX512: Add writemask variants for vperm*2* This includes assembler and codegen support (see the new tests in avx512-encodings.s and avx512-shuffle.ll). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@212221 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 82 +++++++++++++++++++++++++----- test/CodeGen/X86/avx512-shuffle.ll | 62 ++++++++++++++++++++++ test/MC/X86/avx512-encodings.s | 16 ++++++ 3 files changed, 146 insertions(+), 14 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 02df80e5666..daff30bea45 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -609,7 +609,7 @@ defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, // -- VPERM2I - 3 source operands form -- multiclass avx512_perm_3src opc, string OpcodeStr, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop, - SDNode OpNode, ValueType OpVT> { + SDNode OpNode, ValueType OpVT, RegisterClass KRC> { let Constraints = "$src1 = $dst" in { def rr : AVX5128I, EVEX_4V; + def rrk : AVX5128I, + EVEX_4V, EVEX_K; + + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def rrkz : AVX5128I, + EVEX_4V, EVEX_KZ; + def rm : AVX5128I, EVEX_4V; + + def rmk : AVX5128I, + EVEX_4V, EVEX_K; + + let AddedComplexity = 10 in // Prefer over the rrkz variant + def rmkz : AVX5128I, + EVEX_4V, EVEX_KZ; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem, - X86VPermiv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem, - X86VPermiv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem, - X86VPermiv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem, - X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, + i512mem, X86VPermiv3, v16i32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, + i512mem, X86VPermiv3, v8i64, VK8WM>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, + i512mem, X86VPermiv3, v16f32, VK16WM>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, + i512mem, X86VPermiv3, v8f64, VK8WM>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; defm VPERMT2D : avx512_perm_3src<0x7E, "vpermt2d", VR512, memopv16i32, i512mem, - X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; + X86VPermv3, v16i32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPERMT2Q : avx512_perm_3src<0x7E, "vpermt2q", VR512, memopv8i64, i512mem, - X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + X86VPermv3, v8i64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512mem, - X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; + X86VPermv3, v16f32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem, - X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + X86VPermv3, v8f64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx), (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))), diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll index 23ddc3a6c1d..b99e89a9a54 100644 --- a/test/CodeGen/X86/avx512-shuffle.ll +++ b/test/CodeGen/X86/avx512-shuffle.ll @@ -56,6 +56,16 @@ define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind { ret <8 x double> %c } +; The reg variant of vpermt2 with a writemask +; CHECK-LABEL: test5m: +; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}} +define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> + %m = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %m, <8 x double> %c, <8 x double> zeroinitializer + ret <8 x double> %res +} + ; CHECK-LABEL: test6: ; CHECK: vpermq $30 ; CHECK: ret @@ -72,6 +82,27 @@ define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind { ret <8 x i64> %c } +; The reg variant of vpermt2 with a writemask +; CHECK-LABEL: test7m: +; CHECK: vpermt2q {{.* {%k[1-7]} {z}}} +define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind { + %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> + %m = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer + ret <8 x i64> %res +} + +; The mem variant of vpermt2 with a writemask +; CHECK-LABEL: test7mm: +; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}} +define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind { + %b = load <8 x i64>* %pb + %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> + %m = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer + ret <8 x i64> %res +} + ; CHECK-LABEL: test8: ; CHECK: vpermt2d ; CHECK: ret @@ -80,6 +111,27 @@ define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind { ret <16 x i32> %c } +; The reg variant of vpermt2 with a writemask +; CHECK-LABEL: test8m: +; CHECK: vpermt2d {{.* {%k[1-7]} {z}}} +define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> + %m = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + +; The mem variant of vpermt2 with a writemask +; CHECK-LABEL: test8mm: +; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}} +define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind { + %b = load <16 x i32> * %pb + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> + %m = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer + ret <16 x i32> %res +} + ; CHECK-LABEL: test9: ; CHECK: vpermt2ps ; CHECK: ret @@ -88,6 +140,16 @@ define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind { ret <16 x float> %c } +; The reg variant of vpermt2 with a writemask +; CHECK-LABEL: test9m: +; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z} +define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind { + %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> + %m = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer + ret <16 x float> %res +} + ; CHECK-LABEL: test10: ; CHECK: vpermt2ps ( ; CHECK: ret diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s index e83cf02df4c..b4be41d5d74 100644 --- a/test/MC/X86/avx512-encodings.s +++ b/test/MC/X86/avx512-encodings.s @@ -3199,3 +3199,19 @@ vpcmpd $1, %zmm24, %zmm7, %k5{%k4} // CHECK: vpcmpuq $2, // CHECK: encoding: [0x62,0xf3,0xf5,0x47,0x1e,0x72,0x01,0x02] vpcmpuq $2, 0x40(%rdx), %zmm17, %k6{%k7} + +// CHECK: vpermi2d +// CHECK: encoding: [0x62,0x42,0x6d,0x4b,0x76,0xd6] +vpermi2d %zmm14, %zmm2, %zmm26 {%k3} + +// CHECK: vpermt2pd +// CHECK: encoding: [0x62,0xf2,0xcd,0xc6,0x7f,0xf3] +vpermt2pd %zmm3, %zmm22, %zmm6 {%k6} {z} + +// CHECK: vpermi2q +// CHECK: encoding: [0x62,0x62,0xed,0x4b,0x76,0x54,0x58,0x02] +vpermi2q 0x80(%rax,%rbx,2), %zmm2, %zmm26 {%k3} + +// CHECK: vpermt2d +// CHECK: encoding: [0x62,0x32,0x4d,0xc2,0x7e,0x24,0xad,0x05,0x00,0x00,0x00] +vpermt2d 5(,%r13,4), %zmm22, %zmm12 {%k2} {z}