From cb8fe113a34cf0d4ac7ae63d9280992aa66b076b Mon Sep 17 00:00:00 2001 From: Igor Breger Date: Thu, 23 Jul 2015 07:39:21 +0000 Subject: [PATCH] AVX-512: Implemented encoding , DAG lowering and intrinsics for Integer Truncate with/without saturation Added tests for DAG lowering ,encoding and intrinsic Differential Revision: http://reviews.llvm.org/D11218 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242990 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 544 ++++++++ include/llvm/Target/TargetSelectionDAG.td | 53 +- lib/Target/X86/X86ISelLowering.cpp | 132 +- lib/Target/X86/X86ISelLowering.h | 5 +- lib/Target/X86/X86InstrAVX512.td | 273 +++- lib/Target/X86/X86InstrFragmentsSIMD.td | 18 +- lib/Target/X86/X86IntrinsicsInfo.h | 148 +- .../{avx512-trunc-ext.ll => avx512-ext.ll} | 26 +- test/CodeGen/X86/avx512-intrinsics.ll | 390 ++++++ test/CodeGen/X86/avx512-trunc.ll | 364 +++++ test/CodeGen/X86/avx512bw-intrinsics.ll | 78 ++ test/CodeGen/X86/avx512bwvl-intrinsics.ll | 156 +++ test/CodeGen/X86/avx512vl-intrinsics.ll | 780 +++++++++++ test/CodeGen/X86/masked_memop.ll | 11 +- test/MC/X86/x86-64-avx512bw.s | 120 ++ test/MC/X86/x86-64-avx512bw_vl.s | 480 +++++++ test/MC/X86/x86-64-avx512f_vl.s | 1200 +++++++++++++++++ 17 files changed, 4651 insertions(+), 127 deletions(-) rename test/CodeGen/X86/{avx512-trunc-ext.ll => avx512-ext.ll} (98%) create mode 100644 test/CodeGen/X86/avx512-trunc.ll diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 352a592bbd8..7362c409c84 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -5816,6 +5816,550 @@ let TargetPrefix = "x86" in { llvm_i8_ty], [IntrReadArgMem]>; } + +// truncate +let TargetPrefix = "x86" in { + def int_x86_avx512_mask_pmov_qb_128 : + GCCBuiltin<"__builtin_ia32_pmovqb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qb_mem_128 : + 
GCCBuiltin<"__builtin_ia32_pmovqb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qb_128 : + GCCBuiltin<"__builtin_ia32_pmovsqb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsqb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qb_128 : + GCCBuiltin<"__builtin_ia32_pmovusqb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusqb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qb_256 : + GCCBuiltin<"__builtin_ia32_pmovqb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovqb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qb_256 : + GCCBuiltin<"__builtin_ia32_pmovsqb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovsqb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qb_256 : + GCCBuiltin<"__builtin_ia32_pmovusqb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusqb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qb_512 : + 
GCCBuiltin<"__builtin_ia32_pmovqb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovqb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qb_512 : + GCCBuiltin<"__builtin_ia32_pmovsqb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsqb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qb_512 : + GCCBuiltin<"__builtin_ia32_pmovusqb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusqb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qw_128 : + GCCBuiltin<"__builtin_ia32_pmovqw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovqw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qw_128 : + GCCBuiltin<"__builtin_ia32_pmovsqw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsqw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qw_128 : + GCCBuiltin<"__builtin_ia32_pmovusqw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qw_mem_128 : + 
GCCBuiltin<"__builtin_ia32_pmovusqw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qw_256 : + GCCBuiltin<"__builtin_ia32_pmovqw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovqw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qw_256 : + GCCBuiltin<"__builtin_ia32_pmovsqw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovsqw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qw_256 : + GCCBuiltin<"__builtin_ia32_pmovusqw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusqw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qw_512 : + GCCBuiltin<"__builtin_ia32_pmovqw512_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovqw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qw_512 : + GCCBuiltin<"__builtin_ia32_pmovsqw512_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsqw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qw_512 : + 
GCCBuiltin<"__builtin_ia32_pmovusqw512_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusqw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qd_128 : + GCCBuiltin<"__builtin_ia32_pmovqd128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qd_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovqd128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qd_128 : + GCCBuiltin<"__builtin_ia32_pmovsqd128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qd_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsqd128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qd_128 : + GCCBuiltin<"__builtin_ia32_pmovusqd128_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qd_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusqd128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qd_256 : + GCCBuiltin<"__builtin_ia32_pmovqd256_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qd_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovqd256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qd_256 : + GCCBuiltin<"__builtin_ia32_pmovsqd256_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qd_mem_256 : + 
GCCBuiltin<"__builtin_ia32_pmovsqd256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qd_256 : + GCCBuiltin<"__builtin_ia32_pmovusqd256_mask">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qd_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusqd256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_qd_512 : + GCCBuiltin<"__builtin_ia32_pmovqd512_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_qd_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovqd512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_qd_512 : + GCCBuiltin<"__builtin_ia32_pmovsqd512_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_qd_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsqd512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_qd_512 : + GCCBuiltin<"__builtin_ia32_pmovusqd512_mask">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_qd_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusqd512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_db_128 : + GCCBuiltin<"__builtin_ia32_pmovdb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_db_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovdb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_db_128 : + 
GCCBuiltin<"__builtin_ia32_pmovsdb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_db_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsdb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_db_128 : + GCCBuiltin<"__builtin_ia32_pmovusdb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_db_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusdb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_db_256 : + GCCBuiltin<"__builtin_ia32_pmovdb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_db_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovdb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_db_256 : + GCCBuiltin<"__builtin_ia32_pmovsdb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_db_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovsdb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_db_256 : + GCCBuiltin<"__builtin_ia32_pmovusdb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_db_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusdb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_db_512 : + GCCBuiltin<"__builtin_ia32_pmovdb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_db_mem_512 : + 
GCCBuiltin<"__builtin_ia32_pmovdb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_db_512 : + GCCBuiltin<"__builtin_ia32_pmovsdb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_db_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsdb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_db_512 : + GCCBuiltin<"__builtin_ia32_pmovusdb512_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_db_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusdb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_dw_128 : + GCCBuiltin<"__builtin_ia32_pmovdw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_dw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovdw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_dw_128 : + GCCBuiltin<"__builtin_ia32_pmovsdw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_dw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovsdw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_dw_128 : + GCCBuiltin<"__builtin_ia32_pmovusdw128_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_dw_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovusdw128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_dw_256 : + 
GCCBuiltin<"__builtin_ia32_pmovdw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_dw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovdw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_dw_256 : + GCCBuiltin<"__builtin_ia32_pmovsdw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_dw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovsdw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_dw_256 : + GCCBuiltin<"__builtin_ia32_pmovusdw256_mask">, + Intrinsic<[llvm_v8i16_ty], + [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_dw_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovusdw256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_dw_512 : + GCCBuiltin<"__builtin_ia32_pmovdw512_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_dw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovdw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_dw_512 : + GCCBuiltin<"__builtin_ia32_pmovsdw512_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_dw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovsdw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_dw_512 : + GCCBuiltin<"__builtin_ia32_pmovusdw512_mask">, + Intrinsic<[llvm_v16i16_ty], + [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def 
int_x86_avx512_mask_pmovus_dw_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovusdw512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_wb_128 : + GCCBuiltin<"__builtin_ia32_pmovwb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_wb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovwb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_wb_128 : + GCCBuiltin<"__builtin_ia32_pmovswb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_wb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovswb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_wb_128 : + GCCBuiltin<"__builtin_ia32_pmovuswb128_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_wb_mem_128 : + GCCBuiltin<"__builtin_ia32_pmovuswb128mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_wb_256 : + GCCBuiltin<"__builtin_ia32_pmovwb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_wb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovwb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_wb_256 : + GCCBuiltin<"__builtin_ia32_pmovswb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_wb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovswb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def 
int_x86_avx512_mask_pmovus_wb_256 : + GCCBuiltin<"__builtin_ia32_pmovuswb256_mask">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_wb_mem_256 : + GCCBuiltin<"__builtin_ia32_pmovuswb256mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmov_wb_512 : + GCCBuiltin<"__builtin_ia32_pmovwb512_mask">, + Intrinsic<[llvm_v32i8_ty], + [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmov_wb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovwb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovs_wb_512 : + GCCBuiltin<"__builtin_ia32_pmovswb512_mask">, + Intrinsic<[llvm_v32i8_ty], + [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovs_wb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovswb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrReadWriteArgMem]>; + def int_x86_avx512_mask_pmovus_wb_512 : + GCCBuiltin<"__builtin_ia32_pmovuswb512_mask">, + Intrinsic<[llvm_v32i8_ty], + [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_mask_pmovus_wb_mem_512 : + GCCBuiltin<"__builtin_ia32_pmovuswb512mem_mask">, + Intrinsic<[], + [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], + [IntrReadWriteArgMem]>; +} // Misc. 
let TargetPrefix = "x86" in { def int_x86_avx512_mask_cmp_ps_512 : diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index 6c7eef14715..fe5cee8d731 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -493,9 +493,10 @@ def atomic_load : SDNode<"ISD::ATOMIC_LOAD", SDTAtomicLoad, def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def masked_store : SDNode<"ISD::MSTORE", SDTMaskedStore, +// Do not use mld, mst directly. Use masked_store masked_load, masked_truncstore +def mst : SDNode<"ISD::MSTORE", SDTMaskedStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def masked_load : SDNode<"ISD::MLOAD", SDTMaskedLoad, +def mld : SDNode<"ISD::MLOAD", SDTMaskedLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -680,6 +681,12 @@ def load : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{ return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; }]>; +// masked load fragments. +def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (mld node:$src1, node:$src2, node:$src3), [{ + return cast(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + // extending load fragments. def extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{ return cast(N)->getExtensionType() == ISD::EXTLOAD; @@ -791,6 +798,12 @@ def store : PatFrag<(ops node:$val, node:$ptr), return !cast(N)->isTruncatingStore(); }]>; +// masked store fragments. +def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (mst node:$src1, node:$src2, node:$src3), [{ + return !cast(N)->isTruncatingStore(); +}]>; + // truncstore fragments. 
def truncstore : PatFrag<(ops node:$val, node:$ptr), (unindexedstore node:$val, node:$ptr), [{ @@ -817,6 +830,21 @@ def truncstoref64 : PatFrag<(ops node:$val, node:$ptr), return cast(N)->getMemoryVT() == MVT::f64; }]>; +def truncstorevi8 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def truncstorevi16 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def truncstorevi32 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + // indexed store fragments. def istore : PatFrag<(ops node:$val, node:$base, node:$offset), (ist node:$val, node:$base, node:$offset), [{ @@ -891,6 +919,27 @@ def post_truncstf32 : PatFrag<(ops node:$val, node:$base, node:$offset), return cast(N)->getMemoryVT() == MVT::f32; }]>; +// masked truncstore fragments +def masked_truncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (mst node:$src1, node:$src2, node:$src3), [{ + return cast(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_truncstore node:$src1, node:$src2, node:$src3), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_truncstore node:$src1, node:$src2, node:$src3), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_truncstore node:$src1, node:$src2, node:$src3), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + // setcc convenience fragments. 
def setoeq : PatFrag<(ops node:$lhs, node:$rhs), (setcc node:$lhs, node:$rhs, SETOEQ)>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index dc73cb2392f..16aa96e8663 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1348,6 +1348,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); @@ -1556,6 +1574,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1566,6 +1585,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, 
setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; @@ -12485,10 +12509,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { Subtarget->hasDQI() && Subtarget->hasVLX()) return Op; // legal, will go to VPMOVB2M, VPMOVQ2M } - if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { - if (VT.getVectorElementType().getSizeInBits() >=8) - return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + if (VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); unsigned NumElts = InVT.getVectorNumElements(); assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); @@ -12504,6 +12526,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); } + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (((!InVT.is512BitVector() && Subtarget->hasVLX()) || InVT.is512BitVector()) && + (InVT.getVectorElementType() != MVT::i16 || Subtarget->hasBWI())) + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { @@ -15220,7 +15247,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the -/// necessary casting for \p Mask when lowering masking intrinsics. 
+/// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, @@ -15228,8 +15255,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, EVT VT = Op.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + SDValue VMask = SDValue(); + unsigned OpcodeSelect = ISD::VSELECT; SDLoc dl(Op); assert(MaskVT.isSimple() && "invalid mask type"); @@ -15237,11 +15264,20 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, if (isAllOnes(Mask)) return Op; - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + if (MaskVT.bitsGT(Mask.getValueType())) { + EVT newMaskVT = EVT::getIntegerVT(*DAG.getContext(), + MaskVT.getSizeInBits()); + VMask = DAG.getBitcast(MaskVT, + DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask)); + } else { + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } switch (Op.getOpcode()) { default: break; @@ -15250,10 +15286,18 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, case X86ISD::CMPM: case X86ISD::CMPMU: return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. 
For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; } if (PreservedSrc.getOpcode() == ISD::UNDEF) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. @@ -16111,6 +16155,45 @@ static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, return Chain; } +/// \brief Lower intrinsics for TRUNCATE_TO_MEM case +/// return truncate Store/MaskedStore Node +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + SelectionDAG &DAG, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + EVT VT = DataToTruncate.getValueType(); + EVT SVT = EVT::getVectorVT(*DAG.getContext(), + ElementType, VT.getVectorNumElements()); + + if (isAllOnes(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); + + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), + MVT::i1, VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + + MachineMemOperand *MMO = DAG.getMachineFunction(). 
+ getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); + + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); +} + static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); @@ -16244,6 +16327,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); } + case TRUNCATE_TO_MEM_VI8: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); + case TRUNCATE_TO_MEM_VI16: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); + case TRUNCATE_TO_MEM_VI32: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); @@ -18954,7 +19043,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; - case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; + case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; + case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; @@ -24093,6 +24183,15 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. 
+ if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); @@ -24204,6 +24303,13 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 7fab1938682..3d058e8da13 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -282,9 +282,8 @@ namespace llvm { // Vector integer truncate. VTRUNC, - - // Vector integer truncate with mask. - VTRUNCM, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, VTRUNCS, // Vector FP extend. 
VFPEXT, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 215dcebe661..5fd38ddb902 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5571,82 +5571,217 @@ defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W // Integer truncate and extend operations //------------------------------------------------- -multiclass avx512_trunc_sat opc, string OpcodeStr, - RegisterClass dstRC, RegisterClass srcRC, - RegisterClass KRC, X86MemOperand x86memop> { - def rr : AVX512XS8I opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, + X86MemOperand x86memop> { + + defm rr : AVX512_maskable, + EVEX, T8XS; + + // for intrinsic pattern match + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + undef)), + (!cast(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.ImmAllZerosV)), + (!cast(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.RC:$src0)), + (!cast(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0, + DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + let mayStore = 1 in { + def mr : AVX512XS8I, EVEX; - def rrk : AVX512XS8I, EVEX, EVEX_K; - - def rrkz : AVX512XS8I, EVEX, EVEX_KZ; - - def mr : AVX512XS8I, EVEX; - - def mrk : AVX512XS8I, EVEX, EVEX_K; - + }//mayStore = 1 } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512,
EVEX_CD8<8, CD8VO>; -defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>; -def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>; -def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>; -def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; -def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; +multiclass avx512_trunc_mr_lowering { -def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v16i16 
(X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>; -def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; + def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), + (!cast(NAME#SrcInfo.ZSuffix##mr) + addr:$dst, SrcInfo.RC:$src)>; + def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask, + (SrcInfo.VT SrcInfo.RC:$src)), + (!cast(NAME#SrcInfo.ZSuffix##mrk) + addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; +} + +multiclass avx512_trunc_sat_mr_lowering { + + def: Pat<(!cast("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask), + (!cast(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr, + (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM), + (SrcInfo.VT SrcInfo.RC:$src))>; + + def: Pat<(!cast("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), + (!cast(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, + (SrcInfo.VT SrcInfo.RC:$src))>; +} + +multiclass avx512_trunc opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, + Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common, + avx512_trunc_mr_lowering, EVEX_V128; + + defm Z256: avx512_trunc_common, + avx512_trunc_mr_lowering, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common, + avx512_trunc_mr_lowering, EVEX_V512; +} + +multiclass avx512_trunc_sat opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + 
X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common, + avx512_trunc_sat_mr_lowering, EVEX_V128; + + defm Z256: avx512_trunc_common, + avx512_trunc_sat_mr_lowering, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common, + avx512_trunc_sat_mr_lowering, EVEX_V512; +} + +multiclass avx512_trunc_qb opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<8, CD8VO>; +} +multiclass avx512_trunc_sat_qb opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<8, CD8VO>; +} + +multiclass avx512_trunc_qw opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<16, CD8VQ>; +} +multiclass avx512_trunc_sat_qw opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_sat_qd opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<32, CD8VH>; +} + +multiclass avx512_trunc_db opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<8, CD8VQ>; +} +multiclass avx512_trunc_sat_db opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_dw opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<16, CD8VH>; +} + +multiclass avx512_trunc_wb opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_wb opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; 
+defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; +defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; +defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; +defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; +defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; +defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; +defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; +defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; +defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; +defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; +defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; +defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; multiclass avx512_extend_common opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 401b3267368..bf999dce047 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -114,19 +114,17 @@ def X86vsext : SDNode<"X86ISD::VSEXT", SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>]>>; -def X86vtrunc : SDNode<"X86ISD::VTRUNC", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<0, 1>]>>; +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + def X86trunc : 
SDNode<"X86ISD::TRUNC", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>]>>; - -def X86vtruncm : SDNode<"X86ISD::VTRUNCM", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisVec<2>, SDTCisInt<2>, - SDTCisOpSmallerThanOp<0, 2>]>>; def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 1383fa37306..a8ad8deec5d 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -21,10 +21,12 @@ enum IntrinsicType { GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, + INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, + TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, BLEND }; @@ -138,6 +140,42 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -813,6 +851,114 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, 
INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), 
+ X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-ext.ll similarity index 98% rename from test/CodeGen/X86/avx512-trunc-ext.ll rename to test/CodeGen/X86/avx512-ext.ll index f25458972e4..aa1dd4928c3 100644 --- a/test/CodeGen/X86/avx512-trunc-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -1,24 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX - -; KNL-LABEL: trunc_16x32_to_16x8 -; KNL: vpmovdb -; KNL: ret -define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) nounwind readnone { - %x = trunc <16 x i32> %i to <16 x i8> - ret <16 x i8> %x -} - -; KNL-LABEL: trunc_8x64_to_8x16 -; KNL: vpmovqw -; KNL: ret -define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) nounwind readnone { - %x = trunc <8 x i64> %i to <8 x i16> - ret <8 x i16> %x -} - -;SKX-LABEL: zext_8x8mem_to_8x16: + ;SKX-LABEL: zext_8x8mem_to_8x16: ;SKX: ## BB#0: ;SKX-NEXT: vpmovw2m %xmm0, %k1 ;SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z} @@ -895,13 +878,6 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { ret <8 x i32> %y } -; KNL-LABEL: 
trunc_v16i32_to_v16i16 -; KNL: vpmovdw -; KNL: ret -define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) { - %1 = trunc <16 x i32> %x to <16 x i16> - ret <16 x i16> %1 -} ; KNL-LABEL: trunc_i32_to_i1 ; KNL: movw $-4, %ax diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 6e50fda7467..7c30063ce28 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -3119,6 +3119,396 @@ define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 ret <16 x float> %res2 } +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512: +; CHECK: vpmovqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512: +; CHECK: vpmovqb %zmm0, (%rdi) +; CHECK: vpmovqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_pmovs_qb_512: +; CHECK: vpmovsqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512: +; CHECK: vpmovsqb %zmm0, (%rdi) +; CHECK: vpmovsqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512: +; CHECK: vpmovusqb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512: +; CHECK: vpmovusqb %zmm0, (%rdi) +; CHECK: vpmovusqb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512: +; CHECK: vpmovqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512: +; CHECK: vpmovqw %zmm0, (%rdi) +; CHECK: vpmovqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512: +; CHECK: vpmovsqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + 
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512: +; CHECK: vpmovsqw %zmm0, (%rdi) +; CHECK: vpmovsqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512: +; CHECK: vpmovusqw %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %zmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512: +; CHECK: vpmovusqw %zmm0, (%rdi) +; CHECK: vpmovusqw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void 
@llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512: +; CHECK: vpmovqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512: +; CHECK: vpmovqd %zmm0, (%rdi) +; CHECK: vpmovqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512: +; CHECK: vpmovsqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x 
i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512: +; CHECK: vpmovsqd %zmm0, (%rdi) +; CHECK: vpmovsqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512: +; CHECK: vpmovusqd %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %zmm0, %ymm0 + %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + %res3 = add <8 x i32> %res0, %res1 + %res4 = add <8 x i32> %res3, %res2 + ret <8 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512: +; CHECK: vpmovusqd %zmm0, (%rdi) +; CHECK: vpmovusqd %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, 
i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512: +; CHECK: vpmovdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512: +; CHECK: vpmovdb %zmm0, (%rdi) +; CHECK: vpmovdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512: +; CHECK: vpmovsdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, 
<16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512: +; CHECK: vpmovsdb %zmm0, (%rdi) +; CHECK: vpmovsdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512: +; CHECK: vpmovusdb %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %zmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512: +; CHECK: vpmovusdb %zmm0, (%rdi) +; CHECK: vpmovusdb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512: +; CHECK: vpmovdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 + %res0 = call 
<16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512: +; CHECK: vpmovdw %zmm0, (%rdi) +; CHECK: vpmovdw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512: +; CHECK: vpmovsdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovsdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %zmm0, %ymm0 + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512: +; CHECK: vpmovsdw %zmm0, (%rdi) +; CHECK: vpmovsdw %zmm0, (%rdi) {%k1} + call void 
@llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16) + +define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512: +; CHECK: vpmovusdw %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovusdw %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %zmm0, %ymm0 + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512: +; CHECK: vpmovusdw %zmm0, (%rdi) +; CHECK: vpmovusdw %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8) define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) { diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll new file mode 100644 index 00000000000..9205feda7eb --- /dev/null +++ b/test/CodeGen/X86/avx512-trunc.ll @@ -0,0 +1,364 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin 
-mcpu=skx | FileCheck %s --check-prefix=SKX + + attributes #0 = { nounwind } + +; KNL-LABEL: trunc_16x32_to_16x8 +; KNL: vpmovdb +; KNL: ret +define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 { + %x = trunc <16 x i32> %i to <16 x i8> + ret <16 x i8> %x +} + +; KNL-LABEL: trunc_8x64_to_8x16 +; KNL: vpmovqw +; KNL: ret +define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 { + %x = trunc <8 x i64> %i to <8 x i16> + ret <8 x i16> %x +} + +; KNL-LABEL: trunc_v16i32_to_v16i16 +; KNL: vpmovdw +; KNL: ret +define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 { + %1 = trunc <16 x i32> %x to <16 x i16> + ret <16 x i16> %1 +} + +define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + +define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i8> + ret <4 x i8> %x +} + +define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i8> + store <4 x i8> %x, <4 x i8>* %res + ret void +} + +define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qb_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i8> + ret <2 x i8> %x +} + +define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 { +; SKX-LABEL: trunc_qb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i8> + store <2 x i8> %x, <2 x i8>* %res + ret void 
+} + +define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + ret <8 x i16> %x +} + +define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i16> + store <8 x i16> %x, <8 x i16>* %res + ret void +} + +define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i16> + ret <4 x i16> %x +} + +define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i16> + store <4 x i16> %x, <4 x i16>* %res + ret void +} + +define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qw_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i16> + ret <2 x i16> %x +} + +define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 { +; SKX-LABEL: trunc_qw_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqw %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i16> + store <2 x i16> %x, <2 x i16>* %res + ret void +} + +define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i32> + ret <8 x i32> %x +} + +define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i64> %i to <8 x i32> + store <8 x i32> %x, <8 x i32>* %res + ret void +} + +define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; 
SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i32> + ret <4 x i32> %x +} + +define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i64> %i to <4 x i32> + store <4 x i32> %x, <4 x i32>* %res + ret void +} + +define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { +; SKX-LABEL: trunc_qd_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i32> + ret <2 x i32> %x +} + +define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { +; SKX-LABEL: trunc_qd_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovqd %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <2 x i64> %i to <2 x i32> + store <2 x i32> %x, <2 x i32>* %res + ret void +} + +define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 { +; SKX-LABEL: trunc_db_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %zmm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + ret <16 x i8> %x +} + +define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i8> + store <16 x i8> %x, <16 x i8>* %res + ret void +} + +define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 { +; SKX-LABEL: trunc_db_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} + +define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 { +; SKX-LABEL: trunc_db_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i8> + ret <4 x i8> %x +} + +define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 { +; SKX-LABEL: trunc_db_128_mem: +; 
SKX: ## BB#0: +; SKX-NEXT: vpmovdb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i8> + store <4 x i8> %x, <4 x i8>* %res + ret void +} + +define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i16> + ret <16 x i16> %x +} + +define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i32> %i to <16 x i16> + store <16 x i16> %x, <16 x i16>* %res + ret void +} + +define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i16> + ret <8 x i16> %x +} + +define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i32> %i to <8 x i16> + store <8 x i16> %x, <8 x i16>* %res + ret void +} + +define <4 x i16> @trunc_dw_128(<4 x i32> %i) #0 { +; SKX-LABEL: trunc_dw_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i16> + ret <4 x i16> %x +} + +define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 { +; SKX-LABEL: trunc_dw_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovdw %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <4 x i32> %i to <4 x i16> + store <4 x i16> %x, <4 x i16>* %res + ret void +} + +define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 { +; SKX-LABEL: trunc_wb_512: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %zmm0, %ymm0 +; SKX-NEXT: retq + %x = trunc <32 x i16> %i to <32 x i8> + ret <32 x i8> %x +} + +define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_512_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %zmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <32 x i16> %i to <32 x i8> + store <32 x i8> %x, 
<32 x i8>* %res + ret void +} + +define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 { +; SKX-LABEL: trunc_wb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %ymm0, %xmm0 +; SKX-NEXT: retq + %x = trunc <16 x i16> %i to <16 x i8> + ret <16 x i8> %x +} + +define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %ymm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <16 x i16> %i to <16 x i8> + store <16 x i8> %x, <16 x i8>* %res + ret void +} + +define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 { +; SKX-LABEL: trunc_wb_128: +; SKX: ## BB#0: +; SKX-NEXT: retq + %x = trunc <8 x i16> %i to <8 x i8> + ret <8 x i8> %x +} + +define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { +; SKX-LABEL: trunc_wb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovwb %xmm0, (%rdi) +; SKX-NEXT: retq + %x = trunc <8 x i16> %i to <8 x i8> + store <8 x i8> %x, <8 x i8>* %res + ret void +} diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index b2b417df2f1..5ad28ab5ab5 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1008,6 +1008,84 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i ret <32 x i16> %res2 } +declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_512: +; CHECK: vpmovwb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = 
add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512: +; CHECK: vpmovwb %zmm0, (%rdi) +; CHECK: vpmovwb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: +; CHECK: vpmovswb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %zmm0, %ymm0 + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512: +; CHECK: vpmovswb %zmm0, (%rdi) +; CHECK: vpmovswb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + +declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: +; CHECK: vpmovuswb %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %zmm0, %ymm0 + %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + %res3 = add <32 x i8> %res0, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512: +; CHECK: vpmovuswb %zmm0, (%rdi) +; CHECK: vpmovuswb %zmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2) + ret void +} + declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 2373dc089ae..ee76ae2a8a3 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -3876,6 +3876,162 @@ define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i ret <16 x i16> %res2 } +declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128: +; CHECK: vpmovwb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovwb 
%xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128: +; CHECK: vpmovwb %xmm0, (%rdi) +; CHECK: vpmovwb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128: +; CHECK: vpmovswb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128: +; CHECK: vpmovswb %xmm0, (%rdi) +; CHECK: vpmovswb %xmm0, (%rdi) {%k1} + call void 
@llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128: +; CHECK: vpmovuswb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128: +; CHECK: vpmovuswb %xmm0, (%rdi) +; CHECK: vpmovuswb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256: +; CHECK: vpmovwb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call 
<16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256: +; CHECK: vpmovwb %ymm0, (%rdi) +; CHECK: vpmovwb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256: +; CHECK: vpmovswb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovswb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256: +; CHECK: vpmovswb %ymm0, (%rdi) +; CHECK: vpmovswb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) + ret void +} + +declare <16 x i8> 
@llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256: +; CHECK: vpmovuswb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16) + +define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256: +; CHECK: vpmovuswb %ymm0, (%rdi) +; CHECK: vpmovuswb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2) + ret void +} + declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8) define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) { diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 46ee51f47b6..7812148de1c 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3005,6 +3005,786 @@ define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x ret <8 x float> %res2 } +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_128: +; CHECK: vpmovqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_128: +; CHECK: vpmovqb %xmm0, (%rdi) +; CHECK: vpmovqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_128: +; CHECK: vpmovsqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_pmovs_qb_mem_128: +; CHECK: vpmovsqb %xmm0, (%rdi) +; CHECK: vpmovsqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_128: +; CHECK: vpmovusqb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_128: +; CHECK: vpmovusqb %xmm0, (%rdi) +; CHECK: vpmovusqb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_256: +; CHECK: vpmovqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, 
i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_256: +; CHECK: vpmovqb %ymm0, (%rdi) +; CHECK: vpmovqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_256: +; CHECK: vpmovsqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_256: +; CHECK: vpmovsqb %ymm0, (%rdi) +; CHECK: vpmovsqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, 
<4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_256: +; CHECK: vpmovusqb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_256: +; CHECK: vpmovusqb %ymm0, (%rdi) +; CHECK: vpmovusqb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_128: +; CHECK: vpmovqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = 
add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_128: +; CHECK: vpmovqw %xmm0, (%rdi) +; CHECK: vpmovqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_128: +; CHECK: vpmovsqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_128: +; CHECK: vpmovsqw %xmm0, (%rdi) +; CHECK: vpmovsqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_pmovus_qw_128: +; CHECK: vpmovusqw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_128: +; CHECK: vpmovusqw %xmm0, (%rdi) +; CHECK: vpmovusqw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_256: +; CHECK: vpmovqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_256: +; CHECK: vpmovqw %ymm0, (%rdi) +; CHECK: vpmovqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_256: +; CHECK: vpmovsqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_256: +; CHECK: vpmovsqw %ymm0, (%rdi) +; CHECK: vpmovsqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_256: +; CHECK: vpmovusqw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> 
%x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_256: +; CHECK: vpmovusqw %ymm0, (%rdi) +; CHECK: vpmovusqw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_128: +; CHECK: vpmovqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_128: +; CHECK: vpmovqd %xmm0, (%rdi) +; CHECK: vpmovqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* 
%ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_128: +; CHECK: vpmovsqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_128: +; CHECK: vpmovsqd %xmm0, (%rdi) +; CHECK: vpmovsqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_128: +; CHECK: vpmovusqd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %xmm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + 
%res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_128: +; CHECK: vpmovusqd %xmm0, (%rdi) +; CHECK: vpmovusqd %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_256: +; CHECK: vpmovqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_256: +; CHECK: vpmovqd %ymm0, (%rdi) +; CHECK: vpmovqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_pmovs_qd_256: +; CHECK: vpmovsqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_256: +; CHECK: vpmovsqd %ymm0, (%rdi) +; CHECK: vpmovsqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_256: +; CHECK: vpmovusqd %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusqd %ymm0, %xmm0 + %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res3 = add <4 x i32> %res0, %res1 + %res4 = add <4 x i32> %res3, %res2 + ret <4 x i32> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_256: +; CHECK: vpmovusqd %ymm0, (%rdi) +; CHECK: vpmovusqd %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_128: +; CHECK: vpmovdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_128: +; CHECK: vpmovdb %xmm0, (%rdi) +; CHECK: vpmovdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_128: +; CHECK: vpmovsdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + 
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_128: +; CHECK: vpmovsdb %xmm0, (%rdi) +; CHECK: vpmovsdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_128: +; CHECK: vpmovusdb %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %xmm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_128: +; CHECK: vpmovusdb %xmm0, (%rdi) +; CHECK: vpmovusdb %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void 
@llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_256: +; CHECK: vpmovdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_256: +; CHECK: vpmovdb %ymm0, (%rdi) +; CHECK: vpmovdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_256: +; CHECK: vpmovsdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x 
i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_256: +; CHECK: vpmovsdb %ymm0, (%rdi) +; CHECK: vpmovsdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_256: +; CHECK: vpmovusdb %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdb %ymm0, %xmm0 + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_256: +; CHECK: vpmovusdb %ymm0, (%rdi) +; CHECK: vpmovusdb %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 
%x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_128: +; CHECK: vpmovdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_128: +; CHECK: vpmovdw %xmm0, (%rdi) +; CHECK: vpmovdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_128: +; CHECK: vpmovsdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; 
CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_128: +; CHECK: vpmovsdw %xmm0, (%rdi) +; CHECK: vpmovsdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_128: +; CHECK: vpmovusdw %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %xmm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_128: +; CHECK: vpmovusdw %xmm0, (%rdi) +; CHECK: vpmovusdw %xmm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_256: +; CHECK: vpmovdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 
x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmov_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_256: +; CHECK: vpmovdw %ymm0, (%rdi) +; CHECK: vpmovdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_256: +; CHECK: vpmovsdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovsdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_256: +; CHECK: vpmovsdw %ymm0, (%rdi) +; CHECK: vpmovsdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void 
@llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + +declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_256: +; CHECK: vpmovusdw %ymm0, %xmm1 {%k1} +; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vpmovusdw %ymm0, %xmm0 + %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) + %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) + %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2) + %res3 = add <8 x i16> %res0, %res1 + %res4 = add <8 x i16> %res3, %res2 + ret <8 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32>, i8) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_256: +; CHECK: vpmovusdw %ymm0, (%rdi) +; CHECK: vpmovusdw %ymm0, (%rdi) {%k1} + call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2) + ret void +} + declare <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32>, <2 x double>, i8) define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) { diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll index 6c16e634a59..f51d4fa103e 100644 --- a/test/CodeGen/X86/masked_memop.ll +++ b/test/CodeGen/X86/masked_memop.ll @@ -190,10 +190,13 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { ; AVX2-LABEL: test15 ; AVX2: vpmaskmovd -; SKX-LABEL: test15 -; SKX: kshiftl -; SKX: kshiftr -; SKX: vmovdqu32 {{.*}}{%k1} +; SKX-LABEL: test15: +; SKX: ## BB#0: 
+; SKX-NEXT: vpandq {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1 +; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1} +; SKX-NEXT: retq define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) diff --git a/test/MC/X86/x86-64-avx512bw.s b/test/MC/X86/x86-64-avx512bw.s index 8d72b086a6c..9e10ca42c3b 100644 --- a/test/MC/X86/x86-64-avx512bw.s +++ b/test/MC/X86/x86-64-avx512bw.s @@ -3668,6 +3668,126 @@ // CHECK: encoding: [0x62,0x62,0x7d,0x48,0x1d,0xb2,0xc0,0xdf,0xff,0xff] vpabsw -8256(%rdx), %zmm30 +// CHECK: vpmovwb %zmm27, %ymm22 +// CHECK: encoding: [0x62,0x22,0x7e,0x48,0x30,0xde] + vpmovwb %zmm27, %ymm22 + +// CHECK: vpmovwb %zmm27, %ymm22 {%k1} +// CHECK: encoding: [0x62,0x22,0x7e,0x49,0x30,0xde] + vpmovwb %zmm27, %ymm22 {%k1} + +// CHECK: vpmovwb %zmm27, %ymm22 {%k1} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0xc9,0x30,0xde] + vpmovwb %zmm27, %ymm22 {%k1} {z} + +// CHECK: vpmovwb %zmm22, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0x31] + vpmovwb %zmm22, (%rcx) + +// CHECK: vpmovwb %zmm22, (%rcx) {%k4} +// CHECK: encoding: [0x62,0xe2,0x7e,0x4c,0x30,0x31] + vpmovwb %zmm22, (%rcx) {%k4} + +// CHECK: vpmovwb %zmm22, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x30,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovwb %zmm22, 291(%rax,%r14,8) + +// CHECK: vpmovwb %zmm22, 4064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0x72,0x7f] + vpmovwb %zmm22, 4064(%rdx) + +// CHECK: vpmovwb %zmm22, 4096(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0xb2,0x00,0x10,0x00,0x00] + vpmovwb %zmm22, 4096(%rdx) + +// CHECK: vpmovwb %zmm22, -4096(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0x72,0x80] + vpmovwb %zmm22, -4096(%rdx) + +// CHECK: vpmovwb %zmm22, -4128(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x48,0x30,0xb2,0xe0,0xef,0xff,0xff] + 
vpmovwb %zmm22, -4128(%rdx) + +// CHECK: vpmovswb %zmm18, %ymm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x48,0x20,0xd7] + vpmovswb %zmm18, %ymm23 + +// CHECK: vpmovswb %zmm18, %ymm23 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x4a,0x20,0xd7] + vpmovswb %zmm18, %ymm23 {%k2} + +// CHECK: vpmovswb %zmm18, %ymm23 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xca,0x20,0xd7] + vpmovswb %zmm18, %ymm23 {%k2} {z} + +// CHECK: vpmovswb %zmm24, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x01] + vpmovswb %zmm24, (%rcx) + +// CHECK: vpmovswb %zmm24, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x4f,0x20,0x01] + vpmovswb %zmm24, (%rcx) {%k7} + +// CHECK: vpmovswb %zmm24, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x48,0x20,0x84,0xf0,0x23,0x01,0x00,0x00] + vpmovswb %zmm24, 291(%rax,%r14,8) + +// CHECK: vpmovswb %zmm24, 4064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x42,0x7f] + vpmovswb %zmm24, 4064(%rdx) + +// CHECK: vpmovswb %zmm24, 4096(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x82,0x00,0x10,0x00,0x00] + vpmovswb %zmm24, 4096(%rdx) + +// CHECK: vpmovswb %zmm24, -4096(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x42,0x80] + vpmovswb %zmm24, -4096(%rdx) + +// CHECK: vpmovswb %zmm24, -4128(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x20,0x82,0xe0,0xef,0xff,0xff] + vpmovswb %zmm24, -4128(%rdx) + +// CHECK: vpmovuswb %zmm22, %ymm28 +// CHECK: encoding: [0x62,0x82,0x7e,0x48,0x10,0xf4] + vpmovuswb %zmm22, %ymm28 + +// CHECK: vpmovuswb %zmm22, %ymm28 {%k3} +// CHECK: encoding: [0x62,0x82,0x7e,0x4b,0x10,0xf4] + vpmovuswb %zmm22, %ymm28 {%k3} + +// CHECK: vpmovuswb %zmm22, %ymm28 {%k3} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xcb,0x10,0xf4] + vpmovuswb %zmm22, %ymm28 {%k3} {z} + +// CHECK: vpmovuswb %zmm27, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x19] + vpmovuswb %zmm27, (%rcx) + +// CHECK: vpmovuswb %zmm27, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x4a,0x10,0x19] + vpmovuswb %zmm27, (%rcx) {%k2} + 
+// CHECK: vpmovuswb %zmm27, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x48,0x10,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmovuswb %zmm27, 291(%rax,%r14,8) + +// CHECK: vpmovuswb %zmm27, 4064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x5a,0x7f] + vpmovuswb %zmm27, 4064(%rdx) + +// CHECK: vpmovuswb %zmm27, 4096(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x9a,0x00,0x10,0x00,0x00] + vpmovuswb %zmm27, 4096(%rdx) + +// CHECK: vpmovuswb %zmm27, -4096(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x5a,0x80] + vpmovuswb %zmm27, -4096(%rdx) + +// CHECK: vpmovuswb %zmm27, -4128(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x48,0x10,0x9a,0xe0,0xef,0xff,0xff] + vpmovuswb %zmm27, -4128(%rdx) + // CHECK: vpmulhuw %zmm21, %zmm24, %zmm21 // CHECK: encoding: [0x62,0xa1,0x3d,0x40,0xe4,0xed] vpmulhuw %zmm21, %zmm24, %zmm21 diff --git a/test/MC/X86/x86-64-avx512bw_vl.s b/test/MC/X86/x86-64-avx512bw_vl.s index dcc9db577eb..24b8a56efe6 100644 --- a/test/MC/X86/x86-64-avx512bw_vl.s +++ b/test/MC/X86/x86-64-avx512bw_vl.s @@ -6583,6 +6583,486 @@ // CHECK: encoding: [0x62,0xe2,0x6d,0x20,0x00,0x9a,0xe0,0xef,0xff,0xff] vpshufb -4128(%rdx), %ymm18, %ymm19 +// CHECK: vpmovwb %xmm28, %xmm27 +// CHECK: encoding: [0x62,0x02,0x7e,0x08,0x30,0xe3] + vpmovwb %xmm28, %xmm27 + +// CHECK: vpmovwb %xmm28, %xmm27 {%k2} +// CHECK: encoding: [0x62,0x02,0x7e,0x0a,0x30,0xe3] + vpmovwb %xmm28, %xmm27 {%k2} + +// CHECK: vpmovwb %xmm28, %xmm27 {%k2} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0x8a,0x30,0xe3] + vpmovwb %xmm28, %xmm27 {%k2} {z} + +// CHECK: vpmovwb %ymm26, %xmm26 +// CHECK: encoding: [0x62,0x02,0x7e,0x28,0x30,0xd2] + vpmovwb %ymm26, %xmm26 + +// CHECK: vpmovwb %ymm26, %xmm26 {%k4} +// CHECK: encoding: [0x62,0x02,0x7e,0x2c,0x30,0xd2] + vpmovwb %ymm26, %xmm26 {%k4} + +// CHECK: vpmovwb %ymm26, %xmm26 {%k4} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0xac,0x30,0xd2] + vpmovwb %ymm26, %xmm26 {%k4} {z} + +// CHECK: vpmovwb %xmm23, (%rcx) +// CHECK: encoding: 
[0x62,0xe2,0x7e,0x08,0x30,0x39] + vpmovwb %xmm23,(%rcx) + +// CHECK: vpmovwb %xmm23, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0e,0x30,0x39] + vpmovwb %xmm23,(%rcx) {%k6} + +// CHECK: vpmovwb %xmm23, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x30,0xbc,0xf0,0x34,0x12,0x00,0x00] + vpmovwb %xmm23,4660(%rax,%r14,8) + +// CHECK: vpmovwb %xmm23, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x7a,0x7f] + vpmovwb %xmm23, 1016(%rdx) + +// CHECK: vpmovwb %xmm23, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0xba,0x00,0x04,0x00,0x00] + vpmovwb %xmm23, 1024(%rdx) + +// CHECK: vpmovwb %xmm23, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x7a,0x80] + vpmovwb %xmm23,-1024(%rdx) + +// CHECK: vpmovwb %xmm23, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0xba,0xf8,0xfb,0xff,0xff] + vpmovwb %xmm23,-1032(%rdx) + +// CHECK: vpmovwb %ymm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x29] + vpmovwb %ymm21,(%rcx) + +// CHECK: vpmovwb %ymm21, (%rcx) {%k5} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2d,0x30,0x29] + vpmovwb %ymm21,(%rcx) {%k5} + +// CHECK: vpmovwb %ymm21, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x30,0xac,0xf0,0x34,0x12,0x00,0x00] + vpmovwb %ymm21, 4660(%rax,%r14,8) + +// CHECK: vpmovwb %ymm21, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x6a,0x7f] + vpmovwb %ymm21, 2032(%rdx) + +// CHECK: vpmovwb %ymm21, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0xaa,0x00,0x08,0x00,0x00] + vpmovwb %ymm21, 2048(%rdx) + +// CHECK: vpmovwb %ymm21, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x6a,0x80] + vpmovwb %ymm21,-2048(%rdx) + +// CHECK: vpmovwb %ymm21, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0xaa,0xf0,0xf7,0xff,0xff] + vpmovwb %ymm21, -2064(%rdx) + +// CHECK: vpmovswb %xmm19, %xmm17 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x20,0xd9] + vpmovswb %xmm19, %xmm17 + +// CHECK: vpmovswb %xmm19, %xmm17 {%k1} +// CHECK: 
encoding: [0x62,0xa2,0x7e,0x09,0x20,0xd9] + vpmovswb %xmm19, %xmm17 {%k1} + +// CHECK: vpmovswb %xmm19, %xmm17 {%k1} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x89,0x20,0xd9] + vpmovswb %xmm19, %xmm17 {%k1} {z} + +// CHECK: vpmovswb %ymm19, %xmm21 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x20,0xdd] + vpmovswb %ymm19, %xmm21 + +// CHECK: vpmovswb %ymm19, %xmm21 {%k4} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2c,0x20,0xdd] + vpmovswb %ymm19, %xmm21 {%k4} + +// CHECK: vpmovswb %ymm19, %xmm21 {%k4} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xac,0x20,0xdd] + vpmovswb %ymm19, %xmm21 {%k4} {z} + +// CHECK: vpmovswb %xmm18, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x11] + vpmovswb %xmm18,(%rcx) + +// CHECK: vpmovswb %xmm18, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0a,0x20,0x11] + vpmovswb %xmm18,(%rcx) {%k2} + +// CHECK: vpmovswb %xmm18, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x20,0x94,0xf0,0x34,0x12,0x00,0x00] + vpmovswb %xmm18, 4660(%rax,%r14,8) + +// CHECK: vpmovswb %xmm18, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x52,0x7f] + vpmovswb %xmm18, 1016(%rdx) + +// CHECK: vpmovswb %xmm18, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x92,0x00,0x04,0x00,0x00] + vpmovswb %xmm18, 1024(%rdx) + +// CHECK: vpmovswb %xmm18, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x52,0x80] + vpmovswb %xmm18, -1024(%rdx) + +// CHECK: vpmovswb %xmm18, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x20,0x92,0xf8,0xfb,0xff,0xff] + vpmovswb %xmm18, -1032(%rdx) + +// CHECK: vpmovswb %ymm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0x39] + vpmovswb %ymm23,(%rcx) + +// CHECK: vpmovswb %ymm23, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2a,0x20,0x39] + vpmovswb %ymm23,(%rcx) {%k2} + +// CHECK: vpmovswb %ymm23, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x20,0xbc,0xf0,0x34,0x12,0x00,0x00] + vpmovswb %ymm23, 4660(%rax,%r14,8) + +// CHECK: vpmovswb %ymm23, 2032(%rdx) +// 
CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0x7a,0x7f] + vpmovswb %ymm23, 2032(%rdx) + +// CHECK: vpmovswb %ymm23, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0xba,0x00,0x08,0x00,0x00] + vpmovswb %ymm23, 2048(%rdx) + +// CHECK: vpmovswb %ymm23, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0x7a,0x80] + vpmovswb %ymm23, -2048(%rdx) + +// CHECK: vpmovswb %ymm23, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x20,0xba,0xf0,0xf7,0xff,0xff] + vpmovswb %ymm23, -2064(%rdx) + +// CHECK: vpmovuswb %xmm17, %xmm26 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x10,0xca] + vpmovuswb %xmm17, %xmm26 + +// CHECK: vpmovuswb %xmm17, %xmm26 {%k6} +// CHECK: encoding: [0x62,0x82,0x7e,0x0e,0x10,0xca] + vpmovuswb %xmm17, %xmm26 {%k6} + +// CHECK: vpmovuswb %xmm17, %xmm26 {%k6} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8e,0x10,0xca] + vpmovuswb %xmm17, %xmm26 {%k6} {z} + +// CHECK: vpmovuswb %ymm26, %xmm17 +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x10,0xd1] + vpmovuswb %ymm26, %xmm17 + +// CHECK: vpmovuswb %ymm26, %xmm17 {%k2} +// CHECK: encoding: [0x62,0x22,0x7e,0x2a,0x10,0xd1] + vpmovuswb %ymm26, %xmm17 {%k2} + +// CHECK: vpmovuswb %ymm26, %xmm17 {%k2} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0xaa,0x10,0xd1] + vpmovuswb %ymm26, %xmm17 {%k2} {z} + +// CHECK: vpmovuswb %xmm19, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x19] + vpmovuswb %xmm19,(%rcx) + +// CHECK: vpmovuswb %xmm19, (%rcx) {%k1} +// CHECK: encoding: [0x62,0xe2,0x7e,0x09,0x10,0x19] + vpmovuswb %xmm19,(%rcx) {%k1} + +// CHECK: vpmovuswb %xmm19, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x10,0x9c,0xf0,0x34,0x12,0x00,0x00] + vpmovuswb %xmm19, 4660(%rax,%r14,8) + +// CHECK: vpmovuswb %xmm19, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x5a,0x7f] + vpmovuswb %xmm19, 1016(%rdx) + +// CHECK: vpmovuswb %xmm19, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x9a,0x00,0x04,0x00,0x00] + vpmovuswb %xmm19, 1024(%rdx) + +// CHECK: vpmovuswb %xmm19, 
-1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x5a,0x80] + vpmovuswb %xmm19, -1024(%rdx) + +// CHECK: vpmovuswb %xmm19, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x10,0x9a,0xf8,0xfb,0xff,0xff] + vpmovuswb %xmm19, -1032(%rdx) + +// CHECK: vpmovuswb %ymm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0x39] + vpmovuswb %ymm23,(%rcx) + +// CHECK: vpmovuswb %ymm23, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2e,0x10,0x39] + vpmovuswb %ymm23,(%rcx) {%k6} + +// CHECK: vpmovuswb %ymm23, 4660(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x10,0xbc,0xf0,0x34,0x12,0x00,0x00] + vpmovuswb %ymm23, 4660(%rax,%r14,8) + +// CHECK: vpmovuswb %ymm23, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0x7a,0x7f] + vpmovuswb %ymm23, 2032(%rdx) + +// CHECK: vpmovuswb %ymm23, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0xba,0x00,0x08,0x00,0x00] + vpmovuswb %ymm23, 2048(%rdx) + +// CHECK: vpmovuswb %ymm23, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0x7a,0x80] + vpmovuswb %ymm23, -2048(%rdx) + +// CHECK: vpmovuswb %ymm23, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x10,0xba,0xf0,0xf7,0xff,0xff] + vpmovuswb %ymm23, -2064(%rdx) + +// CHECK: vpmovwb %xmm17, %xmm21 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x30,0xcd] + vpmovwb %xmm17, %xmm21 + +// CHECK: vpmovwb %xmm17, %xmm21 {%k1} +// CHECK: encoding: [0x62,0xa2,0x7e,0x09,0x30,0xcd] + vpmovwb %xmm17, %xmm21 {%k1} + +// CHECK: vpmovwb %xmm17, %xmm21 {%k1} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x89,0x30,0xcd] + vpmovwb %xmm17, %xmm21 {%k1} {z} + +// CHECK: vpmovwb %ymm23, %xmm26 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x30,0xfa] + vpmovwb %ymm23, %xmm26 + +// CHECK: vpmovwb %ymm23, %xmm26 {%k7} +// CHECK: encoding: [0x62,0x82,0x7e,0x2f,0x30,0xfa] + vpmovwb %ymm23, %xmm26 {%k7} + +// CHECK: vpmovwb %ymm23, %xmm26 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xaf,0x30,0xfa] + vpmovwb %ymm23, %xmm26 {%k7} {z} + +// CHECK: vpmovwb %xmm21, 
(%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x29] + vpmovwb %xmm21, (%rcx) + +// CHECK: vpmovwb %xmm21, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0a,0x30,0x29] + vpmovwb %xmm21, (%rcx) {%k2} + +// CHECK: vpmovwb %xmm21, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x30,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovwb %xmm21, 291(%rax,%r14,8) + +// CHECK: vpmovwb %xmm21, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x6a,0x7f] + vpmovwb %xmm21, 1016(%rdx) + +// CHECK: vpmovwb %xmm21, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0xaa,0x00,0x04,0x00,0x00] + vpmovwb %xmm21, 1024(%rdx) + +// CHECK: vpmovwb %xmm21, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0x6a,0x80] + vpmovwb %xmm21, -1024(%rdx) + +// CHECK: vpmovwb %xmm21, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x30,0xaa,0xf8,0xfb,0xff,0xff] + vpmovwb %xmm21, -1032(%rdx) + +// CHECK: vpmovwb %ymm20, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x21] + vpmovwb %ymm20, (%rcx) + +// CHECK: vpmovwb %ymm20, (%rcx) {%k4} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2c,0x30,0x21] + vpmovwb %ymm20, (%rcx) {%k4} + +// CHECK: vpmovwb %ymm20, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x30,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovwb %ymm20, 291(%rax,%r14,8) + +// CHECK: vpmovwb %ymm20, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x62,0x7f] + vpmovwb %ymm20, 2032(%rdx) + +// CHECK: vpmovwb %ymm20, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0xa2,0x00,0x08,0x00,0x00] + vpmovwb %ymm20, 2048(%rdx) + +// CHECK: vpmovwb %ymm20, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0x62,0x80] + vpmovwb %ymm20, -2048(%rdx) + +// CHECK: vpmovwb %ymm20, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x30,0xa2,0xf0,0xf7,0xff,0xff] + vpmovwb %ymm20, -2064(%rdx) + +// CHECK: vpmovswb %xmm20, %xmm24 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x20,0xe0] + vpmovswb %xmm20, %xmm24 + +// CHECK: vpmovswb %xmm20, 
%xmm24 {%k4} +// CHECK: encoding: [0x62,0x82,0x7e,0x0c,0x20,0xe0] + vpmovswb %xmm20, %xmm24 {%k4} + +// CHECK: vpmovswb %xmm20, %xmm24 {%k4} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8c,0x20,0xe0] + vpmovswb %xmm20, %xmm24 {%k4} {z} + +// CHECK: vpmovswb %ymm18, %xmm27 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x20,0xd3] + vpmovswb %ymm18, %xmm27 + +// CHECK: vpmovswb %ymm18, %xmm27 {%k1} +// CHECK: encoding: [0x62,0x82,0x7e,0x29,0x20,0xd3] + vpmovswb %ymm18, %xmm27 {%k1} + +// CHECK: vpmovswb %ymm18, %xmm27 {%k1} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xa9,0x20,0xd3] + vpmovswb %ymm18, %xmm27 {%k1} {z} + +// CHECK: vpmovswb %xmm24, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x01] + vpmovswb %xmm24, (%rcx) + +// CHECK: vpmovswb %xmm24, (%rcx) {%k3} +// CHECK: encoding: [0x62,0x62,0x7e,0x0b,0x20,0x01] + vpmovswb %xmm24, (%rcx) {%k3} + +// CHECK: vpmovswb %xmm24, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x20,0x84,0xf0,0x23,0x01,0x00,0x00] + vpmovswb %xmm24, 291(%rax,%r14,8) + +// CHECK: vpmovswb %xmm24, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x42,0x7f] + vpmovswb %xmm24, 1016(%rdx) + +// CHECK: vpmovswb %xmm24, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x82,0x00,0x04,0x00,0x00] + vpmovswb %xmm24, 1024(%rdx) + +// CHECK: vpmovswb %xmm24, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x42,0x80] + vpmovswb %xmm24, -1024(%rdx) + +// CHECK: vpmovswb %xmm24, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x20,0x82,0xf8,0xfb,0xff,0xff] + vpmovswb %xmm24, -1032(%rdx) + +// CHECK: vpmovswb %ymm27, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x19] + vpmovswb %ymm27, (%rcx) + +// CHECK: vpmovswb %ymm27, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x2f,0x20,0x19] + vpmovswb %ymm27, (%rcx) {%k7} + +// CHECK: vpmovswb %ymm27, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x20,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmovswb %ymm27, 291(%rax,%r14,8) + +// CHECK: vpmovswb 
%ymm27, 2032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x5a,0x7f] + vpmovswb %ymm27, 2032(%rdx) + +// CHECK: vpmovswb %ymm27, 2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x9a,0x00,0x08,0x00,0x00] + vpmovswb %ymm27, 2048(%rdx) + +// CHECK: vpmovswb %ymm27, -2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x5a,0x80] + vpmovswb %ymm27, -2048(%rdx) + +// CHECK: vpmovswb %ymm27, -2064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x20,0x9a,0xf0,0xf7,0xff,0xff] + vpmovswb %ymm27, -2064(%rdx) + +// CHECK: vpmovuswb %xmm19, %xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x10,0xdf] + vpmovuswb %xmm19, %xmm23 + +// CHECK: vpmovuswb %xmm19, %xmm23 {%k4} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0c,0x10,0xdf] + vpmovuswb %xmm19, %xmm23 {%k4} + +// CHECK: vpmovuswb %xmm19, %xmm23 {%k4} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8c,0x10,0xdf] + vpmovuswb %xmm19, %xmm23 {%k4} {z} + +// CHECK: vpmovuswb %ymm23, %xmm28 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x10,0xfc] + vpmovuswb %ymm23, %xmm28 + +// CHECK: vpmovuswb %ymm23, %xmm28 {%k6} +// CHECK: encoding: [0x62,0x82,0x7e,0x2e,0x10,0xfc] + vpmovuswb %ymm23, %xmm28 {%k6} + +// CHECK: vpmovuswb %ymm23, %xmm28 {%k6} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xae,0x10,0xfc] + vpmovuswb %ymm23, %xmm28 {%k6} {z} + +// CHECK: vpmovuswb %xmm25, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x09] + vpmovuswb %xmm25, (%rcx) + +// CHECK: vpmovuswb %xmm25, (%rcx) {%k3} +// CHECK: encoding: [0x62,0x62,0x7e,0x0b,0x10,0x09] + vpmovuswb %xmm25, (%rcx) {%k3} + +// CHECK: vpmovuswb %xmm25, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x10,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovuswb %xmm25, 291(%rax,%r14,8) + +// CHECK: vpmovuswb %xmm25, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x4a,0x7f] + vpmovuswb %xmm25, 1016(%rdx) + +// CHECK: vpmovuswb %xmm25, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x8a,0x00,0x04,0x00,0x00] + vpmovuswb %xmm25, 1024(%rdx) + +// 
CHECK: vpmovuswb %xmm25, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x4a,0x80] + vpmovuswb %xmm25, -1024(%rdx) + +// CHECK: vpmovuswb %xmm25, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x10,0x8a,0xf8,0xfb,0xff,0xff] + vpmovuswb %xmm25, -1032(%rdx) + +// CHECK: vpmovuswb %ymm28, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0x21] + vpmovuswb %ymm28, (%rcx) + +// CHECK: vpmovuswb %ymm28, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x2a,0x10,0x21] + vpmovuswb %ymm28, (%rcx) {%k2} + +// CHECK: vpmovuswb %ymm28, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x10,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovuswb %ymm28, 291(%rax,%r14,8) + +// CHECK: vpmovuswb %ymm28, 2032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0x62,0x7f] + vpmovuswb %ymm28, 2032(%rdx) + +// CHECK: vpmovuswb %ymm28, 2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0xa2,0x00,0x08,0x00,0x00] + vpmovuswb %ymm28, 2048(%rdx) + +// CHECK: vpmovuswb %ymm28, -2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0x62,0x80] + vpmovuswb %ymm28, -2048(%rdx) + +// CHECK: vpmovuswb %ymm28, -2064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x10,0xa2,0xf0,0xf7,0xff,0xff] + vpmovuswb %ymm28, -2064(%rdx) + // CHECK: vpmulhuw %xmm18, %xmm21, %xmm24 // CHECK: encoding: [0x62,0x21,0x55,0x00,0xe4,0xc2] vpmulhuw %xmm18, %xmm21, %xmm24 diff --git a/test/MC/X86/x86-64-avx512f_vl.s b/test/MC/X86/x86-64-avx512f_vl.s index eca2ffbfc09..9280be94716 100644 --- a/test/MC/X86/x86-64-avx512f_vl.s +++ b/test/MC/X86/x86-64-avx512f_vl.s @@ -16285,6 +16285,1206 @@ vaddpd {rz-sae}, %zmm2, %zmm1, %zmm1 // CHECK: encoding: [0x62,0x62,0x4d,0x30,0x2c,0x8a,0xfc,0xfd,0xff,0xff] vscalefps -516(%rdx){1to8}, %ymm22, %ymm25 +// CHECK: vpmovqb %xmm29, %xmm24 +// CHECK: encoding: [0x62,0x02,0x7e,0x08,0x32,0xe8] + vpmovqb %xmm29, %xmm24 + +// CHECK: vpmovqb %xmm29, %xmm24 {%k4} +// CHECK: encoding: [0x62,0x02,0x7e,0x0c,0x32,0xe8] + vpmovqb %xmm29, %xmm24 {%k4} + +// CHECK: 
vpmovqb %xmm29, %xmm24 {%k4} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0x8c,0x32,0xe8] + vpmovqb %xmm29, %xmm24 {%k4} {z} + +// CHECK: vpmovqb %ymm29, %xmm17 +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x32,0xe9] + vpmovqb %ymm29, %xmm17 + +// CHECK: vpmovqb %ymm29, %xmm17 {%k3} +// CHECK: encoding: [0x62,0x22,0x7e,0x2b,0x32,0xe9] + vpmovqb %ymm29, %xmm17 {%k3} + +// CHECK: vpmovqb %ymm29, %xmm17 {%k3} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0xab,0x32,0xe9] + vpmovqb %ymm29, %xmm17 {%k3} {z} + +// CHECK: vpmovqb %xmm27, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x19] + vpmovqb %xmm27, (%rcx) + +// CHECK: vpmovqb %xmm27, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x0a,0x32,0x19] + vpmovqb %xmm27, (%rcx) {%k2} + +// CHECK: vpmovqb %xmm27, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x32,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmovqb %xmm27, 291(%rax,%r14,8) + +// CHECK: vpmovqb %xmm27, 254(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x5a,0x7f] + vpmovqb %xmm27, 254(%rdx) + +// CHECK: vpmovqb %xmm27, 256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x9a,0x00,0x01,0x00,0x00] + vpmovqb %xmm27, 256(%rdx) + +// CHECK: vpmovqb %xmm27, -256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x5a,0x80] + vpmovqb %xmm27, -256(%rdx) + +// CHECK: vpmovqb %xmm27, -258(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x32,0x9a,0xfe,0xfe,0xff,0xff] + vpmovqb %xmm27, -258(%rdx) + +// CHECK: vpmovqb %ymm28, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0x21] + vpmovqb %ymm28, (%rcx) + +// CHECK: vpmovqb %ymm28, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x2f,0x32,0x21] + vpmovqb %ymm28, (%rcx) {%k7} + +// CHECK: vpmovqb %ymm28, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x32,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovqb %ymm28, 291(%rax,%r14,8) + +// CHECK: vpmovqb %ymm28, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0x62,0x7f] + vpmovqb %ymm28, 508(%rdx) + +// CHECK: vpmovqb %ymm28, 512(%rdx) +// 
CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0xa2,0x00,0x02,0x00,0x00] + vpmovqb %ymm28, 512(%rdx) + +// CHECK: vpmovqb %ymm28, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0x62,0x80] + vpmovqb %ymm28, -512(%rdx) + +// CHECK: vpmovqb %ymm28, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x32,0xa2,0xfc,0xfd,0xff,0xff] + vpmovqb %ymm28, -516(%rdx) + +// CHECK: vpmovsqb %xmm19, %xmm26 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x22,0xda] + vpmovsqb %xmm19, %xmm26 + +// CHECK: vpmovsqb %xmm19, %xmm26 {%k1} +// CHECK: encoding: [0x62,0x82,0x7e,0x09,0x22,0xda] + vpmovsqb %xmm19, %xmm26 {%k1} + +// CHECK: vpmovsqb %xmm19, %xmm26 {%k1} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x89,0x22,0xda] + vpmovsqb %xmm19, %xmm26 {%k1} {z} + +// CHECK: vpmovsqb %ymm20, %xmm20 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x22,0xe4] + vpmovsqb %ymm20, %xmm20 + +// CHECK: vpmovsqb %ymm20, %xmm20 {%k6} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2e,0x22,0xe4] + vpmovsqb %ymm20, %xmm20 {%k6} + +// CHECK: vpmovsqb %ymm20, %xmm20 {%k6} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xae,0x22,0xe4] + vpmovsqb %ymm20, %xmm20 {%k6} {z} + +// CHECK: vpmovsqb %xmm25, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x09] + vpmovsqb %xmm25, (%rcx) + +// CHECK: vpmovsqb %xmm25, (%rcx) {%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x0f,0x22,0x09] + vpmovsqb %xmm25, (%rcx) {%k7} + +// CHECK: vpmovsqb %xmm25, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x22,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovsqb %xmm25, 291(%rax,%r14,8) + +// CHECK: vpmovsqb %xmm25, 254(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x4a,0x7f] + vpmovsqb %xmm25, 254(%rdx) + +// CHECK: vpmovsqb %xmm25, 256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x8a,0x00,0x01,0x00,0x00] + vpmovsqb %xmm25, 256(%rdx) + +// CHECK: vpmovsqb %xmm25, -256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x22,0x4a,0x80] + vpmovsqb %xmm25, -256(%rdx) + +// CHECK: vpmovsqb %xmm25, -258(%rdx) +// CHECK: encoding: 
[0x62,0x62,0x7e,0x08,0x22,0x8a,0xfe,0xfe,0xff,0xff] + vpmovsqb %xmm25, -258(%rdx) + +// CHECK: vpmovsqb %ymm17, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x09] + vpmovsqb %ymm17, (%rcx) + +// CHECK: vpmovsqb %ymm17, (%rcx) {%k4} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2c,0x22,0x09] + vpmovsqb %ymm17, (%rcx) {%k4} + +// CHECK: vpmovsqb %ymm17, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x22,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovsqb %ymm17, 291(%rax,%r14,8) + +// CHECK: vpmovsqb %ymm17, 508(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x4a,0x7f] + vpmovsqb %ymm17, 508(%rdx) + +// CHECK: vpmovsqb %ymm17, 512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x8a,0x00,0x02,0x00,0x00] + vpmovsqb %ymm17, 512(%rdx) + +// CHECK: vpmovsqb %ymm17, -512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x4a,0x80] + vpmovsqb %ymm17, -512(%rdx) + +// CHECK: vpmovsqb %ymm17, -516(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x22,0x8a,0xfc,0xfd,0xff,0xff] + vpmovsqb %ymm17, -516(%rdx) + +// CHECK: vpmovusqb %xmm22, %xmm28 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x12,0xf4] + vpmovusqb %xmm22, %xmm28 + +// CHECK: vpmovusqb %xmm22, %xmm28 {%k2} +// CHECK: encoding: [0x62,0x82,0x7e,0x0a,0x12,0xf4] + vpmovusqb %xmm22, %xmm28 {%k2} + +// CHECK: vpmovusqb %xmm22, %xmm28 {%k2} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8a,0x12,0xf4] + vpmovusqb %xmm22, %xmm28 {%k2} {z} + +// CHECK: vpmovusqb %ymm23, %xmm22 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x12,0xfe] + vpmovusqb %ymm23, %xmm22 + +// CHECK: vpmovusqb %ymm23, %xmm22 {%k7} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2f,0x12,0xfe] + vpmovusqb %ymm23, %xmm22 {%k7} + +// CHECK: vpmovusqb %ymm23, %xmm22 {%k7} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xaf,0x12,0xfe] + vpmovusqb %ymm23, %xmm22 {%k7} {z} + +// CHECK: vpmovusqb %xmm26, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x11] + vpmovusqb %xmm26, (%rcx) + +// CHECK: vpmovusqb %xmm26, (%rcx) {%k5} +// CHECK: encoding: 
[0x62,0x62,0x7e,0x0d,0x12,0x11] + vpmovusqb %xmm26, (%rcx) {%k5} + +// CHECK: vpmovusqb %xmm26, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x12,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusqb %xmm26, 291(%rax,%r14,8) + +// CHECK: vpmovusqb %xmm26, 254(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x52,0x7f] + vpmovusqb %xmm26, 254(%rdx) + +// CHECK: vpmovusqb %xmm26, 256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x92,0x00,0x01,0x00,0x00] + vpmovusqb %xmm26, 256(%rdx) + +// CHECK: vpmovusqb %xmm26, -256(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x52,0x80] + vpmovusqb %xmm26, -256(%rdx) + +// CHECK: vpmovusqb %xmm26, -258(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x12,0x92,0xfe,0xfe,0xff,0xff] + vpmovusqb %xmm26, -258(%rdx) + +// CHECK: vpmovusqb %ymm30, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0x31] + vpmovusqb %ymm30, (%rcx) + +// CHECK: vpmovusqb %ymm30, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x2a,0x12,0x31] + vpmovusqb %ymm30, (%rcx) {%k2} + +// CHECK: vpmovusqb %ymm30, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x12,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovusqb %ymm30, 291(%rax,%r14,8) + +// CHECK: vpmovusqb %ymm30, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0x72,0x7f] + vpmovusqb %ymm30, 508(%rdx) + +// CHECK: vpmovusqb %ymm30, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0xb2,0x00,0x02,0x00,0x00] + vpmovusqb %ymm30, 512(%rdx) + +// CHECK: vpmovusqb %ymm30, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0x72,0x80] + vpmovusqb %ymm30, -512(%rdx) + +// CHECK: vpmovusqb %ymm30, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x12,0xb2,0xfc,0xfd,0xff,0xff] + vpmovusqb %ymm30, -516(%rdx) + +// CHECK: vpmovqw %xmm18, %xmm19 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x34,0xd3] + vpmovqw %xmm18, %xmm19 + +// CHECK: vpmovqw %xmm18, %xmm19 {%k4} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0c,0x34,0xd3] + vpmovqw %xmm18, %xmm19 {%k4} + +// CHECK: vpmovqw 
%xmm18, %xmm19 {%k4} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8c,0x34,0xd3] + vpmovqw %xmm18, %xmm19 {%k4} {z} + +// CHECK: vpmovqw %ymm22, %xmm19 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x34,0xf3] + vpmovqw %ymm22, %xmm19 + +// CHECK: vpmovqw %ymm22, %xmm19 {%k5} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2d,0x34,0xf3] + vpmovqw %ymm22, %xmm19 {%k5} + +// CHECK: vpmovqw %ymm22, %xmm19 {%k5} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xad,0x34,0xf3] + vpmovqw %ymm22, %xmm19 {%k5} {z} + +// CHECK: vpmovqw %xmm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0x29] + vpmovqw %xmm21, (%rcx) + +// CHECK: vpmovqw %xmm21, (%rcx) {%k3} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0b,0x34,0x29] + vpmovqw %xmm21, (%rcx) {%k3} + +// CHECK: vpmovqw %xmm21, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x34,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovqw %xmm21, 291(%rax,%r14,8) + +// CHECK: vpmovqw %xmm21, 508(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0x6a,0x7f] + vpmovqw %xmm21, 508(%rdx) + +// CHECK: vpmovqw %xmm21, 512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0xaa,0x00,0x02,0x00,0x00] + vpmovqw %xmm21, 512(%rdx) + +// CHECK: vpmovqw %xmm21, -512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0x6a,0x80] + vpmovqw %xmm21, -512(%rdx) + +// CHECK: vpmovqw %xmm21, -516(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x34,0xaa,0xfc,0xfd,0xff,0xff] + vpmovqw %xmm21, -516(%rdx) + +// CHECK: vpmovqw %ymm28, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0x21] + vpmovqw %ymm28, (%rcx) + +// CHECK: vpmovqw %ymm28, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x2e,0x34,0x21] + vpmovqw %ymm28, (%rcx) {%k6} + +// CHECK: vpmovqw %ymm28, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x34,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovqw %ymm28, 291(%rax,%r14,8) + +// CHECK: vpmovqw %ymm28, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0x62,0x7f] + vpmovqw %ymm28, 1016(%rdx) + +// CHECK: vpmovqw %ymm28, 1024(%rdx) +// CHECK: 
encoding: [0x62,0x62,0x7e,0x28,0x34,0xa2,0x00,0x04,0x00,0x00] + vpmovqw %ymm28, 1024(%rdx) + +// CHECK: vpmovqw %ymm28, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0x62,0x80] + vpmovqw %ymm28, -1024(%rdx) + +// CHECK: vpmovqw %ymm28, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x34,0xa2,0xf8,0xfb,0xff,0xff] + vpmovqw %ymm28, -1032(%rdx) + +// CHECK: vpmovsqw %xmm18, %xmm26 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x24,0xd2] + vpmovsqw %xmm18, %xmm26 + +// CHECK: vpmovsqw %xmm18, %xmm26 {%k7} +// CHECK: encoding: [0x62,0x82,0x7e,0x0f,0x24,0xd2] + vpmovsqw %xmm18, %xmm26 {%k7} + +// CHECK: vpmovsqw %xmm18, %xmm26 {%k7} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8f,0x24,0xd2] + vpmovsqw %xmm18, %xmm26 {%k7} {z} + +// CHECK: vpmovsqw %ymm20, %xmm28 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x24,0xe4] + vpmovsqw %ymm20, %xmm28 + +// CHECK: vpmovsqw %ymm20, %xmm28 {%k4} +// CHECK: encoding: [0x62,0x82,0x7e,0x2c,0x24,0xe4] + vpmovsqw %ymm20, %xmm28 {%k4} + +// CHECK: vpmovsqw %ymm20, %xmm28 {%k4} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xac,0x24,0xe4] + vpmovsqw %ymm20, %xmm28 {%k4} {z} + +// CHECK: vpmovsqw %xmm30, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0x31] + vpmovsqw %xmm30, (%rcx) + +// CHECK: vpmovsqw %xmm30, (%rcx) {%k4} +// CHECK: encoding: [0x62,0x62,0x7e,0x0c,0x24,0x31] + vpmovsqw %xmm30, (%rcx) {%k4} + +// CHECK: vpmovsqw %xmm30, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x24,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovsqw %xmm30, 291(%rax,%r14,8) + +// CHECK: vpmovsqw %xmm30, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0x72,0x7f] + vpmovsqw %xmm30, 508(%rdx) + +// CHECK: vpmovsqw %xmm30, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0xb2,0x00,0x02,0x00,0x00] + vpmovsqw %xmm30, 512(%rdx) + +// CHECK: vpmovsqw %xmm30, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x24,0x72,0x80] + vpmovsqw %xmm30, -512(%rdx) + +// CHECK: vpmovsqw %xmm30, -516(%rdx) +// CHECK: encoding: 
[0x62,0x62,0x7e,0x08,0x24,0xb2,0xfc,0xfd,0xff,0xff] + vpmovsqw %xmm30, -516(%rdx) + +// CHECK: vpmovsqw %ymm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0x29] + vpmovsqw %ymm21, (%rcx) + +// CHECK: vpmovsqw %ymm21, (%rcx) {%k5} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2d,0x24,0x29] + vpmovsqw %ymm21, (%rcx) {%k5} + +// CHECK: vpmovsqw %ymm21, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x24,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovsqw %ymm21, 291(%rax,%r14,8) + +// CHECK: vpmovsqw %ymm21, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0x6a,0x7f] + vpmovsqw %ymm21, 1016(%rdx) + +// CHECK: vpmovsqw %ymm21, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0xaa,0x00,0x04,0x00,0x00] + vpmovsqw %ymm21, 1024(%rdx) + +// CHECK: vpmovsqw %ymm21, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0x6a,0x80] + vpmovsqw %ymm21, -1024(%rdx) + +// CHECK: vpmovsqw %ymm21, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x24,0xaa,0xf8,0xfb,0xff,0xff] + vpmovsqw %ymm21, -1032(%rdx) + +// CHECK: vpmovusqw %xmm20, %xmm29 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x14,0xe5] + vpmovusqw %xmm20, %xmm29 + +// CHECK: vpmovusqw %xmm20, %xmm29 {%k1} +// CHECK: encoding: [0x62,0x82,0x7e,0x09,0x14,0xe5] + vpmovusqw %xmm20, %xmm29 {%k1} + +// CHECK: vpmovusqw %xmm20, %xmm29 {%k1} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x89,0x14,0xe5] + vpmovusqw %xmm20, %xmm29 {%k1} {z} + +// CHECK: vpmovusqw %ymm21, %xmm20 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x14,0xec] + vpmovusqw %ymm21, %xmm20 + +// CHECK: vpmovusqw %ymm21, %xmm20 {%k5} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2d,0x14,0xec] + vpmovusqw %ymm21, %xmm20 {%k5} + +// CHECK: vpmovusqw %ymm21, %xmm20 {%k5} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xad,0x14,0xec] + vpmovusqw %ymm21, %xmm20 {%k5} {z} + +// CHECK: vpmovusqw %xmm18, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x11] + vpmovusqw %xmm18, (%rcx) + +// CHECK: vpmovusqw %xmm18, (%rcx) {%k1} +// CHECK: encoding: 
[0x62,0xe2,0x7e,0x09,0x14,0x11] + vpmovusqw %xmm18, (%rcx) {%k1} + +// CHECK: vpmovusqw %xmm18, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x14,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusqw %xmm18, 291(%rax,%r14,8) + +// CHECK: vpmovusqw %xmm18, 508(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x52,0x7f] + vpmovusqw %xmm18, 508(%rdx) + +// CHECK: vpmovusqw %xmm18, 512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x92,0x00,0x02,0x00,0x00] + vpmovusqw %xmm18, 512(%rdx) + +// CHECK: vpmovusqw %xmm18, -512(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x52,0x80] + vpmovusqw %xmm18, -512(%rdx) + +// CHECK: vpmovusqw %xmm18, -516(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x14,0x92,0xfc,0xfd,0xff,0xff] + vpmovusqw %xmm18, -516(%rdx) + +// CHECK: vpmovusqw %ymm18, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x11] + vpmovusqw %ymm18, (%rcx) + +// CHECK: vpmovusqw %ymm18, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2a,0x14,0x11] + vpmovusqw %ymm18, (%rcx) {%k2} + +// CHECK: vpmovusqw %ymm18, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x14,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusqw %ymm18, 291(%rax,%r14,8) + +// CHECK: vpmovusqw %ymm18, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x52,0x7f] + vpmovusqw %ymm18, 1016(%rdx) + +// CHECK: vpmovusqw %ymm18, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x92,0x00,0x04,0x00,0x00] + vpmovusqw %ymm18, 1024(%rdx) + +// CHECK: vpmovusqw %ymm18, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x52,0x80] + vpmovusqw %ymm18, -1024(%rdx) + +// CHECK: vpmovusqw %ymm18, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x14,0x92,0xf8,0xfb,0xff,0xff] + vpmovusqw %ymm18, -1032(%rdx) + +// CHECK: vpmovqd %xmm25, %xmm21 +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x35,0xcd] + vpmovqd %xmm25, %xmm21 + +// CHECK: vpmovqd %xmm25, %xmm21 {%k5} +// CHECK: encoding: [0x62,0x22,0x7e,0x0d,0x35,0xcd] + vpmovqd %xmm25, %xmm21 {%k5} + +// CHECK: 
vpmovqd %xmm25, %xmm21 {%k5} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0x8d,0x35,0xcd] + vpmovqd %xmm25, %xmm21 {%k5} {z} + +// CHECK: vpmovqd %ymm22, %xmm21 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x35,0xf5] + vpmovqd %ymm22, %xmm21 + +// CHECK: vpmovqd %ymm22, %xmm21 {%k6} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2e,0x35,0xf5] + vpmovqd %ymm22, %xmm21 {%k6} + +// CHECK: vpmovqd %ymm22, %xmm21 {%k6} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xae,0x35,0xf5] + vpmovqd %ymm22, %xmm21 {%k6} {z} + +// CHECK: vpmovqd %xmm29, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0x29] + vpmovqd %xmm29, (%rcx) + +// CHECK: vpmovqd %xmm29, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x0e,0x35,0x29] + vpmovqd %xmm29, (%rcx) {%k6} + +// CHECK: vpmovqd %xmm29, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x35,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovqd %xmm29, 291(%rax,%r14,8) + +// CHECK: vpmovqd %xmm29, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0x6a,0x7f] + vpmovqd %xmm29, 1016(%rdx) + +// CHECK: vpmovqd %xmm29, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0xaa,0x00,0x04,0x00,0x00] + vpmovqd %xmm29, 1024(%rdx) + +// CHECK: vpmovqd %xmm29, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0x6a,0x80] + vpmovqd %xmm29, -1024(%rdx) + +// CHECK: vpmovqd %xmm29, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x35,0xaa,0xf8,0xfb,0xff,0xff] + vpmovqd %xmm29, -1032(%rdx) + +// CHECK: vpmovqd %ymm30, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0x31] + vpmovqd %ymm30, (%rcx) + +// CHECK: vpmovqd %ymm30, (%rcx) {%k2} +// CHECK: encoding: [0x62,0x62,0x7e,0x2a,0x35,0x31] + vpmovqd %ymm30, (%rcx) {%k2} + +// CHECK: vpmovqd %ymm30, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x35,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovqd %ymm30, 291(%rax,%r14,8) + +// CHECK: vpmovqd %ymm30, 2032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0x72,0x7f] + vpmovqd %ymm30, 2032(%rdx) + +// CHECK: vpmovqd %ymm30, 
2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0xb2,0x00,0x08,0x00,0x00] + vpmovqd %ymm30, 2048(%rdx) + +// CHECK: vpmovqd %ymm30, -2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0x72,0x80] + vpmovqd %ymm30, -2048(%rdx) + +// CHECK: vpmovqd %ymm30, -2064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x35,0xb2,0xf0,0xf7,0xff,0xff] + vpmovqd %ymm30, -2064(%rdx) + +// CHECK: vpmovsqd %xmm21, %xmm21 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x25,0xed] + vpmovsqd %xmm21, %xmm21 + +// CHECK: vpmovsqd %xmm21, %xmm21 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0a,0x25,0xed] + vpmovsqd %xmm21, %xmm21 {%k2} + +// CHECK: vpmovsqd %xmm21, %xmm21 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8a,0x25,0xed] + vpmovsqd %xmm21, %xmm21 {%k2} {z} + +// CHECK: vpmovsqd %ymm29, %xmm29 +// CHECK: encoding: [0x62,0x02,0x7e,0x28,0x25,0xed] + vpmovsqd %ymm29, %xmm29 + +// CHECK: vpmovsqd %ymm29, %xmm29 {%k4} +// CHECK: encoding: [0x62,0x02,0x7e,0x2c,0x25,0xed] + vpmovsqd %ymm29, %xmm29 {%k4} + +// CHECK: vpmovsqd %ymm29, %xmm29 {%k4} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0xac,0x25,0xed] + vpmovsqd %ymm29, %xmm29 {%k4} {z} + +// CHECK: vpmovsqd %xmm17, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x09] + vpmovsqd %xmm17, (%rcx) + +// CHECK: vpmovsqd %xmm17, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0a,0x25,0x09] + vpmovsqd %xmm17, (%rcx) {%k2} + +// CHECK: vpmovsqd %xmm17, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x25,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovsqd %xmm17, 291(%rax,%r14,8) + +// CHECK: vpmovsqd %xmm17, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x4a,0x7f] + vpmovsqd %xmm17, 1016(%rdx) + +// CHECK: vpmovsqd %xmm17, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x8a,0x00,0x04,0x00,0x00] + vpmovsqd %xmm17, 1024(%rdx) + +// CHECK: vpmovsqd %xmm17, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x4a,0x80] + vpmovsqd %xmm17, -1024(%rdx) + +// CHECK: vpmovsqd %xmm17, -1032(%rdx) 
+// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x25,0x8a,0xf8,0xfb,0xff,0xff] + vpmovsqd %xmm17, -1032(%rdx) + +// CHECK: vpmovsqd %ymm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0x39] + vpmovsqd %ymm23, (%rcx) + +// CHECK: vpmovsqd %ymm23, (%rcx) {%k5} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2d,0x25,0x39] + vpmovsqd %ymm23, (%rcx) {%k5} + +// CHECK: vpmovsqd %ymm23, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x25,0xbc,0xf0,0x23,0x01,0x00,0x00] + vpmovsqd %ymm23, 291(%rax,%r14,8) + +// CHECK: vpmovsqd %ymm23, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0x7a,0x7f] + vpmovsqd %ymm23, 2032(%rdx) + +// CHECK: vpmovsqd %ymm23, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0xba,0x00,0x08,0x00,0x00] + vpmovsqd %ymm23, 2048(%rdx) + +// CHECK: vpmovsqd %ymm23, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0x7a,0x80] + vpmovsqd %ymm23, -2048(%rdx) + +// CHECK: vpmovsqd %ymm23, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x25,0xba,0xf0,0xf7,0xff,0xff] + vpmovsqd %ymm23, -2064(%rdx) + +// CHECK: vpmovusqd %xmm21, %xmm25 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x15,0xe9] + vpmovusqd %xmm21, %xmm25 + +// CHECK: vpmovusqd %xmm21, %xmm25 {%k5} +// CHECK: encoding: [0x62,0x82,0x7e,0x0d,0x15,0xe9] + vpmovusqd %xmm21, %xmm25 {%k5} + +// CHECK: vpmovusqd %xmm21, %xmm25 {%k5} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8d,0x15,0xe9] + vpmovusqd %xmm21, %xmm25 {%k5} {z} + +// CHECK: vpmovusqd %ymm21, %xmm20 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x15,0xec] + vpmovusqd %ymm21, %xmm20 + +// CHECK: vpmovusqd %ymm21, %xmm20 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2a,0x15,0xec] + vpmovusqd %ymm21, %xmm20 {%k2} + +// CHECK: vpmovusqd %ymm21, %xmm20 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xaa,0x15,0xec] + vpmovusqd %ymm21, %xmm20 {%k2} {z} + +// CHECK: vpmovusqd %xmm18, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x11] + vpmovusqd %xmm18, (%rcx) + +// CHECK: vpmovusqd %xmm18, (%rcx) {%k1} +// 
CHECK: encoding: [0x62,0xe2,0x7e,0x09,0x15,0x11] + vpmovusqd %xmm18, (%rcx) {%k1} + +// CHECK: vpmovusqd %xmm18, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x15,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusqd %xmm18, 291(%rax,%r14,8) + +// CHECK: vpmovusqd %xmm18, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x52,0x7f] + vpmovusqd %xmm18, 1016(%rdx) + +// CHECK: vpmovusqd %xmm18, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x92,0x00,0x04,0x00,0x00] + vpmovusqd %xmm18, 1024(%rdx) + +// CHECK: vpmovusqd %xmm18, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x52,0x80] + vpmovusqd %xmm18, -1024(%rdx) + +// CHECK: vpmovusqd %xmm18, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x15,0x92,0xf8,0xfb,0xff,0xff] + vpmovusqd %xmm18, -1032(%rdx) + +// CHECK: vpmovusqd %ymm29, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0x29] + vpmovusqd %ymm29, (%rcx) + +// CHECK: vpmovusqd %ymm29, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x2e,0x15,0x29] + vpmovusqd %ymm29, (%rcx) {%k6} + +// CHECK: vpmovusqd %ymm29, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x15,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovusqd %ymm29, 291(%rax,%r14,8) + +// CHECK: vpmovusqd %ymm29, 2032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0x6a,0x7f] + vpmovusqd %ymm29, 2032(%rdx) + +// CHECK: vpmovusqd %ymm29, 2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0xaa,0x00,0x08,0x00,0x00] + vpmovusqd %ymm29, 2048(%rdx) + +// CHECK: vpmovusqd %ymm29, -2048(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0x6a,0x80] + vpmovusqd %ymm29, -2048(%rdx) + +// CHECK: vpmovusqd %ymm29, -2064(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x15,0xaa,0xf0,0xf7,0xff,0xff] + vpmovusqd %ymm29, -2064(%rdx) + +// CHECK: vpmovdb %xmm21, %xmm30 +// CHECK: encoding: [0x62,0x82,0x7e,0x08,0x31,0xee] + vpmovdb %xmm21, %xmm30 + +// CHECK: vpmovdb %xmm21, %xmm30 {%k3} +// CHECK: encoding: [0x62,0x82,0x7e,0x0b,0x31,0xee] + vpmovdb %xmm21, 
%xmm30 {%k3} + +// CHECK: vpmovdb %xmm21, %xmm30 {%k3} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0x8b,0x31,0xee] + vpmovdb %xmm21, %xmm30 {%k3} {z} + +// CHECK: vpmovdb %ymm21, %xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x31,0xef] + vpmovdb %ymm21, %xmm23 + +// CHECK: vpmovdb %ymm21, %xmm23 {%k4} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2c,0x31,0xef] + vpmovdb %ymm21, %xmm23 {%k4} + +// CHECK: vpmovdb %ymm21, %xmm23 {%k4} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xac,0x31,0xef] + vpmovdb %ymm21, %xmm23 {%k4} {z} + +// CHECK: vpmovdb %xmm29, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0x29] + vpmovdb %xmm29, (%rcx) + +// CHECK: vpmovdb %xmm29, (%rcx) {%k3} +// CHECK: encoding: [0x62,0x62,0x7e,0x0b,0x31,0x29] + vpmovdb %xmm29, (%rcx) {%k3} + +// CHECK: vpmovdb %xmm29, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x31,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovdb %xmm29, 291(%rax,%r14,8) + +// CHECK: vpmovdb %xmm29, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0x6a,0x7f] + vpmovdb %xmm29, 508(%rdx) + +// CHECK: vpmovdb %xmm29, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0xaa,0x00,0x02,0x00,0x00] + vpmovdb %xmm29, 512(%rdx) + +// CHECK: vpmovdb %xmm29, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0x6a,0x80] + vpmovdb %xmm29, -512(%rdx) + +// CHECK: vpmovdb %xmm29, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x31,0xaa,0xfc,0xfd,0xff,0xff] + vpmovdb %xmm29, -516(%rdx) + +// CHECK: vpmovdb %ymm26, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x11] + vpmovdb %ymm26, (%rcx) + +// CHECK: vpmovdb %ymm26, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x2e,0x31,0x11] + vpmovdb %ymm26, (%rcx) {%k6} + +// CHECK: vpmovdb %ymm26, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x31,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovdb %ymm26, 291(%rax,%r14,8) + +// CHECK: vpmovdb %ymm26, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x52,0x7f] + vpmovdb %ymm26, 1016(%rdx) + +// CHECK: 
vpmovdb %ymm26, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x92,0x00,0x04,0x00,0x00] + vpmovdb %ymm26, 1024(%rdx) + +// CHECK: vpmovdb %ymm26, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x52,0x80] + vpmovdb %ymm26, -1024(%rdx) + +// CHECK: vpmovdb %ymm26, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x31,0x92,0xf8,0xfb,0xff,0xff] + vpmovdb %ymm26, -1032(%rdx) + +// CHECK: vpmovsdb %xmm27, %xmm30 +// CHECK: encoding: [0x62,0x02,0x7e,0x08,0x21,0xde] + vpmovsdb %xmm27, %xmm30 + +// CHECK: vpmovsdb %xmm27, %xmm30 {%k1} +// CHECK: encoding: [0x62,0x02,0x7e,0x09,0x21,0xde] + vpmovsdb %xmm27, %xmm30 {%k1} + +// CHECK: vpmovsdb %xmm27, %xmm30 {%k1} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0x89,0x21,0xde] + vpmovsdb %xmm27, %xmm30 {%k1} {z} + +// CHECK: vpmovsdb %ymm27, %xmm26 +// CHECK: encoding: [0x62,0x02,0x7e,0x28,0x21,0xda] + vpmovsdb %ymm27, %xmm26 + +// CHECK: vpmovsdb %ymm27, %xmm26 {%k3} +// CHECK: encoding: [0x62,0x02,0x7e,0x2b,0x21,0xda] + vpmovsdb %ymm27, %xmm26 {%k3} + +// CHECK: vpmovsdb %ymm27, %xmm26 {%k3} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0xab,0x21,0xda] + vpmovsdb %ymm27, %xmm26 {%k3} {z} + +// CHECK: vpmovsdb %xmm30, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0x31] + vpmovsdb %xmm30, (%rcx) + +// CHECK: vpmovsdb %xmm30, (%rcx) {%k3} +// CHECK: encoding: [0x62,0x62,0x7e,0x0b,0x21,0x31] + vpmovsdb %xmm30, (%rcx) {%k3} + +// CHECK: vpmovsdb %xmm30, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x21,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovsdb %xmm30, 291(%rax,%r14,8) + +// CHECK: vpmovsdb %xmm30, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0x72,0x7f] + vpmovsdb %xmm30, 508(%rdx) + +// CHECK: vpmovsdb %xmm30, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0xb2,0x00,0x02,0x00,0x00] + vpmovsdb %xmm30, 512(%rdx) + +// CHECK: vpmovsdb %xmm30, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0x72,0x80] + vpmovsdb %xmm30, -512(%rdx) + +// CHECK: vpmovsdb %xmm30, 
-516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x21,0xb2,0xfc,0xfd,0xff,0xff] + vpmovsdb %xmm30, -516(%rdx) + +// CHECK: vpmovsdb %ymm25, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x09] + vpmovsdb %ymm25, (%rcx) + +// CHECK: vpmovsdb %ymm25, (%rcx) {%k5} +// CHECK: encoding: [0x62,0x62,0x7e,0x2d,0x21,0x09] + vpmovsdb %ymm25, (%rcx) {%k5} + +// CHECK: vpmovsdb %ymm25, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x21,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovsdb %ymm25, 291(%rax,%r14,8) + +// CHECK: vpmovsdb %ymm25, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x4a,0x7f] + vpmovsdb %ymm25, 1016(%rdx) + +// CHECK: vpmovsdb %ymm25, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x8a,0x00,0x04,0x00,0x00] + vpmovsdb %ymm25, 1024(%rdx) + +// CHECK: vpmovsdb %ymm25, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x4a,0x80] + vpmovsdb %ymm25, -1024(%rdx) + +// CHECK: vpmovsdb %ymm25, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x21,0x8a,0xf8,0xfb,0xff,0xff] + vpmovsdb %ymm25, -1032(%rdx) + +// CHECK: vpmovusdb %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x02,0x7e,0x08,0x11,0xee] + vpmovusdb %xmm29, %xmm30 + +// CHECK: vpmovusdb %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x02,0x7e,0x0f,0x11,0xee] + vpmovusdb %xmm29, %xmm30 {%k7} + +// CHECK: vpmovusdb %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0x8f,0x11,0xee] + vpmovusdb %xmm29, %xmm30 {%k7} {z} + +// CHECK: vpmovusdb %ymm17, %xmm23 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x11,0xcf] + vpmovusdb %ymm17, %xmm23 + +// CHECK: vpmovusdb %ymm17, %xmm23 {%k6} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2e,0x11,0xcf] + vpmovusdb %ymm17, %xmm23 {%k6} + +// CHECK: vpmovusdb %ymm17, %xmm23 {%k6} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xae,0x11,0xcf] + vpmovusdb %ymm17, %xmm23 {%k6} {z} + +// CHECK: vpmovusdb %xmm26, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x11] + vpmovusdb %xmm26, (%rcx) + +// CHECK: vpmovusdb %xmm26, (%rcx) 
{%k7} +// CHECK: encoding: [0x62,0x62,0x7e,0x0f,0x11,0x11] + vpmovusdb %xmm26, (%rcx) {%k7} + +// CHECK: vpmovusdb %xmm26, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x11,0x94,0xf0,0x23,0x01,0x00,0x00] + vpmovusdb %xmm26, 291(%rax,%r14,8) + +// CHECK: vpmovusdb %xmm26, 508(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x52,0x7f] + vpmovusdb %xmm26, 508(%rdx) + +// CHECK: vpmovusdb %xmm26, 512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x92,0x00,0x02,0x00,0x00] + vpmovusdb %xmm26, 512(%rdx) + +// CHECK: vpmovusdb %xmm26, -512(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x52,0x80] + vpmovusdb %xmm26, -512(%rdx) + +// CHECK: vpmovusdb %xmm26, -516(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x11,0x92,0xfc,0xfd,0xff,0xff] + vpmovusdb %xmm26, -516(%rdx) + +// CHECK: vpmovusdb %ymm25, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x09] + vpmovusdb %ymm25, (%rcx) + +// CHECK: vpmovusdb %ymm25, (%rcx) {%k6} +// CHECK: encoding: [0x62,0x62,0x7e,0x2e,0x11,0x09] + vpmovusdb %ymm25, (%rcx) {%k6} + +// CHECK: vpmovusdb %ymm25, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x28,0x11,0x8c,0xf0,0x23,0x01,0x00,0x00] + vpmovusdb %ymm25, 291(%rax,%r14,8) + +// CHECK: vpmovusdb %ymm25, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x4a,0x7f] + vpmovusdb %ymm25, 1016(%rdx) + +// CHECK: vpmovusdb %ymm25, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x8a,0x00,0x04,0x00,0x00] + vpmovusdb %ymm25, 1024(%rdx) + +// CHECK: vpmovusdb %ymm25, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x4a,0x80] + vpmovusdb %ymm25, -1024(%rdx) + +// CHECK: vpmovusdb %ymm25, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x28,0x11,0x8a,0xf8,0xfb,0xff,0xff] + vpmovusdb %ymm25, -1032(%rdx) + +// CHECK: vpmovdw %xmm25, %xmm17 +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x33,0xc9] + vpmovdw %xmm25, %xmm17 + +// CHECK: vpmovdw %xmm25, %xmm17 {%k5} +// CHECK: encoding: [0x62,0x22,0x7e,0x0d,0x33,0xc9] + vpmovdw 
%xmm25, %xmm17 {%k5} + +// CHECK: vpmovdw %xmm25, %xmm17 {%k5} {z} +// CHECK: encoding: [0x62,0x22,0x7e,0x8d,0x33,0xc9] + vpmovdw %xmm25, %xmm17 {%k5} {z} + +// CHECK: vpmovdw %ymm19, %xmm25 +// CHECK: encoding: [0x62,0x82,0x7e,0x28,0x33,0xd9] + vpmovdw %ymm19, %xmm25 + +// CHECK: vpmovdw %ymm19, %xmm25 {%k4} +// CHECK: encoding: [0x62,0x82,0x7e,0x2c,0x33,0xd9] + vpmovdw %ymm19, %xmm25 {%k4} + +// CHECK: vpmovdw %ymm19, %xmm25 {%k4} {z} +// CHECK: encoding: [0x62,0x82,0x7e,0xac,0x33,0xd9] + vpmovdw %ymm19, %xmm25 {%k4} {z} + +// CHECK: vpmovdw %xmm21, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0x29] + vpmovdw %xmm21, (%rcx) + +// CHECK: vpmovdw %xmm21, (%rcx) {%k2} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0a,0x33,0x29] + vpmovdw %xmm21, (%rcx) {%k2} + +// CHECK: vpmovdw %xmm21, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x33,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovdw %xmm21, 291(%rax,%r14,8) + +// CHECK: vpmovdw %xmm21, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0x6a,0x7f] + vpmovdw %xmm21, 1016(%rdx) + +// CHECK: vpmovdw %xmm21, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0xaa,0x00,0x04,0x00,0x00] + vpmovdw %xmm21, 1024(%rdx) + +// CHECK: vpmovdw %xmm21, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0x6a,0x80] + vpmovdw %xmm21, -1024(%rdx) + +// CHECK: vpmovdw %xmm21, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x33,0xaa,0xf8,0xfb,0xff,0xff] + vpmovdw %xmm21, -1032(%rdx) + +// CHECK: vpmovdw %ymm22, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0x31] + vpmovdw %ymm22, (%rcx) + +// CHECK: vpmovdw %ymm22, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2e,0x33,0x31] + vpmovdw %ymm22, (%rcx) {%k6} + +// CHECK: vpmovdw %ymm22, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x33,0xb4,0xf0,0x23,0x01,0x00,0x00] + vpmovdw %ymm22, 291(%rax,%r14,8) + +// CHECK: vpmovdw %ymm22, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0x72,0x7f] + vpmovdw %ymm22, 2032(%rdx) + 
+// CHECK: vpmovdw %ymm22, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0xb2,0x00,0x08,0x00,0x00] + vpmovdw %ymm22, 2048(%rdx) + +// CHECK: vpmovdw %ymm22, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0x72,0x80] + vpmovdw %ymm22, -2048(%rdx) + +// CHECK: vpmovdw %ymm22, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x33,0xb2,0xf0,0xf7,0xff,0xff] + vpmovdw %ymm22, -2064(%rdx) + +// CHECK: vpmovsdw %xmm18, %xmm18 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x23,0xd2] + vpmovsdw %xmm18, %xmm18 + +// CHECK: vpmovsdw %xmm18, %xmm18 {%k6} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0e,0x23,0xd2] + vpmovsdw %xmm18, %xmm18 {%k6} + +// CHECK: vpmovsdw %xmm18, %xmm18 {%k6} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8e,0x23,0xd2] + vpmovsdw %xmm18, %xmm18 {%k6} {z} + +// CHECK: vpmovsdw %ymm18, %xmm20 +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x23,0xd4] + vpmovsdw %ymm18, %xmm20 + +// CHECK: vpmovsdw %ymm18, %xmm20 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x2a,0x23,0xd4] + vpmovsdw %ymm18, %xmm20 {%k2} + +// CHECK: vpmovsdw %ymm18, %xmm20 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0xaa,0x23,0xd4] + vpmovsdw %ymm18, %xmm20 {%k2} {z} + +// CHECK: vpmovsdw %xmm29, (%rcx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0x29] + vpmovsdw %xmm29, (%rcx) + +// CHECK: vpmovsdw %xmm29, (%rcx) {%k1} +// CHECK: encoding: [0x62,0x62,0x7e,0x09,0x23,0x29] + vpmovsdw %xmm29, (%rcx) {%k1} + +// CHECK: vpmovsdw %xmm29, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0x22,0x7e,0x08,0x23,0xac,0xf0,0x23,0x01,0x00,0x00] + vpmovsdw %xmm29, 291(%rax,%r14,8) + +// CHECK: vpmovsdw %xmm29, 1016(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0x6a,0x7f] + vpmovsdw %xmm29, 1016(%rdx) + +// CHECK: vpmovsdw %xmm29, 1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0xaa,0x00,0x04,0x00,0x00] + vpmovsdw %xmm29, 1024(%rdx) + +// CHECK: vpmovsdw %xmm29, -1024(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0x6a,0x80] + vpmovsdw %xmm29, -1024(%rdx) + +// CHECK: 
vpmovsdw %xmm29, -1032(%rdx) +// CHECK: encoding: [0x62,0x62,0x7e,0x08,0x23,0xaa,0xf8,0xfb,0xff,0xff] + vpmovsdw %xmm29, -1032(%rdx) + +// CHECK: vpmovsdw %ymm19, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x19] + vpmovsdw %ymm19, (%rcx) + +// CHECK: vpmovsdw %ymm19, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x2e,0x23,0x19] + vpmovsdw %ymm19, (%rcx) {%k6} + +// CHECK: vpmovsdw %ymm19, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x23,0x9c,0xf0,0x23,0x01,0x00,0x00] + vpmovsdw %ymm19, 291(%rax,%r14,8) + +// CHECK: vpmovsdw %ymm19, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x5a,0x7f] + vpmovsdw %ymm19, 2032(%rdx) + +// CHECK: vpmovsdw %ymm19, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x9a,0x00,0x08,0x00,0x00] + vpmovsdw %ymm19, 2048(%rdx) + +// CHECK: vpmovsdw %ymm19, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x5a,0x80] + vpmovsdw %ymm19, -2048(%rdx) + +// CHECK: vpmovsdw %ymm19, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x23,0x9a,0xf0,0xf7,0xff,0xff] + vpmovsdw %ymm19, -2064(%rdx) + +// CHECK: vpmovusdw %xmm18, %xmm18 +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x13,0xd2] + vpmovusdw %xmm18, %xmm18 + +// CHECK: vpmovusdw %xmm18, %xmm18 {%k2} +// CHECK: encoding: [0x62,0xa2,0x7e,0x0a,0x13,0xd2] + vpmovusdw %xmm18, %xmm18 {%k2} + +// CHECK: vpmovusdw %xmm18, %xmm18 {%k2} {z} +// CHECK: encoding: [0x62,0xa2,0x7e,0x8a,0x13,0xd2] + vpmovusdw %xmm18, %xmm18 {%k2} {z} + +// CHECK: vpmovusdw %ymm25, %xmm28 +// CHECK: encoding: [0x62,0x02,0x7e,0x28,0x13,0xcc] + vpmovusdw %ymm25, %xmm28 + +// CHECK: vpmovusdw %ymm25, %xmm28 {%k4} +// CHECK: encoding: [0x62,0x02,0x7e,0x2c,0x13,0xcc] + vpmovusdw %ymm25, %xmm28 {%k4} + +// CHECK: vpmovusdw %ymm25, %xmm28 {%k4} {z} +// CHECK: encoding: [0x62,0x02,0x7e,0xac,0x13,0xcc] + vpmovusdw %ymm25, %xmm28 {%k4} {z} + +// CHECK: vpmovusdw %xmm20, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0x21] + vpmovusdw %xmm20, (%rcx) + +// CHECK: 
vpmovusdw %xmm20, (%rcx) {%k6} +// CHECK: encoding: [0x62,0xe2,0x7e,0x0e,0x13,0x21] + vpmovusdw %xmm20, (%rcx) {%k6} + +// CHECK: vpmovusdw %xmm20, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x08,0x13,0xa4,0xf0,0x23,0x01,0x00,0x00] + vpmovusdw %xmm20, 291(%rax,%r14,8) + +// CHECK: vpmovusdw %xmm20, 1016(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0x62,0x7f] + vpmovusdw %xmm20, 1016(%rdx) + +// CHECK: vpmovusdw %xmm20, 1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0xa2,0x00,0x04,0x00,0x00] + vpmovusdw %xmm20, 1024(%rdx) + +// CHECK: vpmovusdw %xmm20, -1024(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0x62,0x80] + vpmovusdw %xmm20, -1024(%rdx) + +// CHECK: vpmovusdw %xmm20, -1032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x08,0x13,0xa2,0xf8,0xfb,0xff,0xff] + vpmovusdw %xmm20, -1032(%rdx) + +// CHECK: vpmovusdw %ymm23, (%rcx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0x39] + vpmovusdw %ymm23, (%rcx) + +// CHECK: vpmovusdw %ymm23, (%rcx) {%k1} +// CHECK: encoding: [0x62,0xe2,0x7e,0x29,0x13,0x39] + vpmovusdw %ymm23, (%rcx) {%k1} + +// CHECK: vpmovusdw %ymm23, 291(%rax,%r14,8) +// CHECK: encoding: [0x62,0xa2,0x7e,0x28,0x13,0xbc,0xf0,0x23,0x01,0x00,0x00] + vpmovusdw %ymm23, 291(%rax,%r14,8) + +// CHECK: vpmovusdw %ymm23, 2032(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0x7a,0x7f] + vpmovusdw %ymm23, 2032(%rdx) + +// CHECK: vpmovusdw %ymm23, 2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0xba,0x00,0x08,0x00,0x00] + vpmovusdw %ymm23, 2048(%rdx) + +// CHECK: vpmovusdw %ymm23, -2048(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0x7a,0x80] + vpmovusdw %ymm23, -2048(%rdx) + +// CHECK: vpmovusdw %ymm23, -2064(%rdx) +// CHECK: encoding: [0x62,0xe2,0x7e,0x28,0x13,0xba,0xf0,0xf7,0xff,0xff] + vpmovusdw %ymm23, -2064(%rdx) + // CHECK: vrndscalepd $171, %xmm28, %xmm29 // CHECK: encoding: [0x62,0x03,0xfd,0x08,0x09,0xec,0xab] vrndscalepd $0xab, %xmm28, %xmm29