AVX-512: added the VPERM2D, VPERM2Q, VPERM2PS and VPERM2PD instructions.

They give better code sequences than the VPERMI2* instructions that were selected before.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@199893 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Elena Demikhovsky 2014-01-23 14:27:26 +00:00
parent 1334acd8c6
commit e1a621d84f
6 changed files with 171 additions and 158 deletions

View File

@ -7588,8 +7588,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
return DAG.getNode(X86ISD::VPERMV, dl, VT,
DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
return DAG.getNode(X86ISD::VPERMV3, dl, VT,
DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
}
//===--------------------------------------------------------------------===//
@ -14023,6 +14023,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";

View File

@ -339,6 +339,7 @@ namespace llvm {
VPERMILP,
VPERMV,
VPERMV3,
VPERMIV3,
VPERMI,
VPERM2X128,
VBROADCAST,

View File

@ -584,14 +584,14 @@ defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem,
// -- VPERM2I - 3 source operands form --
multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
PatFrag mem_frag, X86MemOperand x86memop,
ValueType OpVT> {
SDNode OpNode, ValueType OpVT> {
let Constraints = "$src1 = $dst" in {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
" \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpVT (X86VPermv3 RC:$src1, RC:$src2, RC:$src3)))]>,
(OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
EVEX_4V;
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
@ -599,19 +599,27 @@ let Constraints = "$src1 = $dst" in {
!strconcat(OpcodeStr,
" \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpVT (X86VPermv3 RC:$src1, RC:$src2,
(OpVT (OpNode RC:$src1, RC:$src2,
(mem_frag addr:$src3))))]>, EVEX_4V;
}
}
defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
X86VPermiv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem,
v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
X86VPermiv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem,
v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
X86VPermiv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem,
v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
X86VPermiv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPERM2D : avx512_perm_3src<0x7E, "vperm2d", VR512, memopv16i32, i512mem,
X86VPermv3, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERM2Q : avx512_perm_3src<0x7E, "vperm2q", VR512, memopv8i64, i512mem,
X86VPermv3, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPERM2PS : avx512_perm_3src<0x7F, "vperm2ps", VR512, memopv16f32, i512mem,
X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERM2PD : avx512_perm_3src<0x7F, "vperm2pd", VR512, memopv8f64, i512mem,
X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
@ -822,8 +830,8 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
[], d>, EVEX_B;
def rmi : AVX512PIi8<0xC2, MRMSrcMem,
(outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc),
!strconcat("vcmp", suffix,
"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
!strconcat("vcmp${cc}", suffix,
" \t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set KRC:$dst,
(X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
@ -1200,12 +1208,12 @@ def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
multiclass avx512_mov_packed<bits<8> opc, RegisterClass RC, RegisterClass KRC,
X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d> {
string asm, Domain d, bit IsReMaterializable = 1> {
let hasSideEffects = 0 in
def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>,
EVEX;
let canFoldAsLoad = 1 in
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, " \t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (ld_frag addr:$src))], d>, EVEX;
@ -1234,7 +1242,7 @@ defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32,
"vmovups", SSEPackedSingle>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64,
"vmovupd", SSEPackedDouble>,
"vmovupd", SSEPackedDouble, 0>,
PD, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),

View File

@ -224,6 +224,7 @@ def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>;
def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;

View File

@ -1540,8 +1540,8 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::VMOVDQA32rm:
case X86::VMOVDQA64rm:
case X86::VMOVAPSZrm:
case X86::VMOVUPSZrm:
return true;
}
}
@ -1567,6 +1567,8 @@ static bool isFrameStoreOpcode(int Opcode) {
case X86::VMOVAPSYmr:
case X86::VMOVAPDYmr:
case X86::VMOVDQAYmr:
case X86::VMOVUPSZmr:
case X86::VMOVAPSZmr:
case X86::MMX_MOVD64mr:
case X86::MMX_MOVQ64mr:
case X86::MMX_MOVNTQmr:

View File

@ -49,7 +49,7 @@ define <8 x double> @test4(<8 x double> %a) nounwind {
}
; CHECK-LABEL: test5:
; CHECK: vpermi2pd
; CHECK: vperm2pd
; CHECK: ret
define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
@ -65,7 +65,7 @@ define <8 x i64> @test6(<8 x i64> %a) nounwind {
}
; CHECK-LABEL: test7:
; CHECK: vpermi2q
; CHECK: vperm2q
; CHECK: ret
define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
%c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
@ -73,7 +73,7 @@ define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
}
; CHECK-LABEL: test8:
; CHECK: vpermi2d
; CHECK: vperm2d
; CHECK: ret
define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@ -81,7 +81,7 @@ define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
}
; CHECK-LABEL: test9:
; CHECK: vpermi2ps
; CHECK: vperm2ps
; CHECK: ret
define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
%c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@ -89,7 +89,7 @@ define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
}
; CHECK-LABEL: test10:
; CHECK: vpermi2ps (
; CHECK: vperm2ps (
; CHECK: ret
define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
%c = load <16 x float>* %b
@ -98,7 +98,7 @@ define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
}
; CHECK-LABEL: test11:
; CHECK: vpermi2d (
; CHECK: vperm2d
; CHECK: ret
define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
%c = load <16 x i32>* %b
@ -202,7 +202,7 @@ define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
}
; CHECK-LABEL: @test24
; CHECK: vpermi2d
; CHECK: vperm2d
; CHECK: ret
define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>