[X86][XOP] Enable commutation for XOP instructions

Patch to allow XOP instructions (integer comparison and integer multiply-add) to be commuted. The comparison instructions sometimes require the compare mode to be flipped, but the remaining instructions can use the default commutation modes.
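
For example (a minimal illustration, not part of the original patch, using the XOP intrinsics; immediate 0 = LT and 2 = GT as in the compare-mode switch below):

; x "less than" y ...
%lt = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %x, <16 x i8> %y, i8 0)
; ... computes the same mask as y "greater than" x, so the operands can be
; commuted provided the compare-mode immediate is flipped
%gt = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %y, <16 x i8> %x, i8 2)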

This patch also sets the SSE execution domains of all the XOP instructions.

Differential Revision: http://reviews.llvm.org/D7646

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@229267 91177308-0d34-0410-b5e6-96231b3b80d8
Simon Pilgrim 2015-02-14 22:40:46 +00:00
parent 32f60795f5
commit 6d5ee8a8b5
3 changed files with 299 additions and 68 deletions

@@ -2906,6 +2906,32 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
return nullptr;
}
}
case X86::VPCOMBri: case X86::VPCOMUBri:
case X86::VPCOMDri: case X86::VPCOMUDri:
case X86::VPCOMQri: case X86::VPCOMUQri:
case X86::VPCOMWri: case X86::VPCOMUWri: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI->getOperand(3).getImm() & 0x7;
switch (Imm) {
case 0x00: Imm = 0x02; break; // LT -> GT
case 0x01: Imm = 0x03; break; // LE -> GE
case 0x02: Imm = 0x00; break; // GT -> LT
case 0x03: Imm = 0x01; break; // GE -> LE
case 0x04: // EQ
case 0x05: // NE
case 0x06: // FALSE
case 0x07: // TRUE
default:
break;
}
if (NewMI) {
MachineFunction &MF = *MI->getParent()->getParent();
MI = MF.CloneMachineInstr(MI);
NewMI = false;
}
MI->getOperand(3).setImm(Imm);
return TargetInstrInfo::commuteInstruction(MI, NewMI);
}
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:

@@ -20,21 +20,23 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
}
defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
let ExeDomain = SSEPackedInt in {
defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
}
// Scalar load 2 addr operand instructions
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
@@ -47,11 +49,6 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
[(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
}
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
ssmem, sse_load_f32>;
defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
sdmem, sse_load_f64>;
multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -62,9 +59,6 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
}
defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
@@ -75,8 +69,19 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
[(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
}
defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
let ExeDomain = SSEPackedSingle in {
defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
ssmem, sse_load_f32>;
defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
}
let ExeDomain = SSEPackedDouble in {
defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
sdmem, sse_load_f64>;
defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
}
multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
@@ -97,18 +102,20 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
XOP_4VOp3;
}
defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
let ExeDomain = SSEPackedInt in {
defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
}
multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
@@ -122,13 +129,16 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
(Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP;
}
defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
let ExeDomain = SSEPackedInt in {
defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
}
// Instruction where second source can be memory, but third must be register
multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
let isCommutable = 1 in
def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -144,21 +154,24 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
VR128:$src3))]>, XOP_4V, VEX_I8IMM;
}
defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
let ExeDomain = SSEPackedInt in {
defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
}
// Instruction where second source can be memory, third must be imm8
multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> {
let isCommutable = 1 in
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, XOPCC:$cc),
!strconcat("vpcom${cc}", Suffix,
@@ -187,14 +200,16 @@ multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> {
}
}
defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>;
defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>;
defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>;
defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>;
defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>;
defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>;
defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>;
defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>;
let ExeDomain = SSEPackedInt in { // SSE integer instructions
defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>;
defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>;
defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>;
defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>;
defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>;
defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>;
defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>;
defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>;
}
// Instruction where either second or third source can be memory
multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -222,8 +237,10 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
XOP_4V, VEX_I8IMM;
}
defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
let ExeDomain = SSEPackedInt in {
defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
}
multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
@@ -250,7 +267,8 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
XOP_4V, VEX_I8IMM, VEX_L;
}
defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
let ExeDomain = SSEPackedInt in
defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
@@ -295,8 +313,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
VEX_L;
}
defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;
let ExeDomain = SSEPackedDouble in
defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
let ExeDomain = SSEPackedSingle in
defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;

@@ -0,0 +1,184 @@
; RUN: llc -O3 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+xop < %s | FileCheck %s
define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) {
;CHECK-LABEL: commute_fold_vpcomb
;CHECK: vpcomgtb (%rdi), %xmm0, %xmm0
%1 = load <16 x i8>* %a0
%2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %1, <16 x i8> %a1, i8 0) ; vpcomltb
ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) {
;CHECK-LABEL: commute_fold_vpcomd
;CHECK: vpcomged (%rdi), %xmm0, %xmm0
%1 = load <4 x i32>* %a0
%2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %1, <4 x i32> %a1, i8 1) ; vpcomled
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) {
;CHECK-LABEL: commute_fold_vpcomq
;CHECK: vpcomltq (%rdi), %xmm0, %xmm0
%1 = load <2 x i64>* %a0
%2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %1, <2 x i64> %a1, i8 2) ; vpcomgtq
ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) {
;CHECK-LABEL: commute_fold_vpcomub
;CHECK: vpcomleub (%rdi), %xmm0, %xmm0
%1 = load <16 x i8>* %a0
%2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %1, <16 x i8> %a1, i8 3) ; vpcomgeub
ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) {
;CHECK-LABEL: commute_fold_vpcomud
;CHECK: vpcomequd (%rdi), %xmm0, %xmm0
%1 = load <4 x i32>* %a0
%2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %1, <4 x i32> %a1, i8 4) ; vpcomequd
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) {
;CHECK-LABEL: commute_fold_vpcomuq
;CHECK: vpcomnequq (%rdi), %xmm0, %xmm0
%1 = load <2 x i64>* %a0
%2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %1, <2 x i64> %a1, i8 5) ; vpcomnequq
ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) {
;CHECK-LABEL: commute_fold_vpcomuw
;CHECK: vpcomfalseuw (%rdi), %xmm0, %xmm0
%1 = load <8 x i16>* %a0
%2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %1, <8 x i16> %a1, i8 6) ; vpcomfalseuw
ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) {
;CHECK-LABEL: commute_fold_vpcomw
;CHECK: vpcomtruew (%rdi), %xmm0, %xmm0
%1 = load <8 x i16>* %a0
%2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %1, <8 x i16> %a1, i8 7) ; vpcomtruew
ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
;CHECK-LABEL: commute_fold_vpmacsdd
;CHECK: vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <4 x i32>* %a0
%2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
;CHECK-LABEL: commute_fold_vpmacsdqh
;CHECK: vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <4 x i32>* %a0
%2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
;CHECK-LABEL: commute_fold_vpmacsdql
;CHECK: vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <4 x i32>* %a0
%2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
;CHECK-LABEL: commute_fold_vpmacssdd
;CHECK: vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <4 x i32>* %a0
%2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
;CHECK-LABEL: commute_fold_vpmacssdqh
;CHECK: vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <4 x i32>* %a0
%2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
;CHECK-LABEL: commute_fold_vpmacssdql
;CHECK: vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <4 x i32>* %a0
%2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
;CHECK-LABEL: commute_fold_vpmacsswd
;CHECK: vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <8 x i16>* %a0
%2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
;CHECK-LABEL: commute_fold_vpmacssww
;CHECK: vpmacssww %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <8 x i16>* %a0
%2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
;CHECK-LABEL: commute_fold_vpmacswd
;CHECK: vpmacswd %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <8 x i16>* %a0
%2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
;CHECK-LABEL: commute_fold_vpmacsww
;CHECK: vpmacsww %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <8 x i16>* %a0
%2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
;CHECK-LABEL: commute_fold_vpmadcsswd
;CHECK: vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <8 x i16>* %a0
%2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
;CHECK-LABEL: commute_fold_vpmadcswd
;CHECK: vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
%1 = load <8 x i16>* %a0
%2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone