AVX-512: Added VMOVD, VMOVQ, VMOVSS, VMOVSD instructions.
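As a quick illustration (lines taken verbatim from the CodeGen test added at the end of this commit and compiled with llc -mcpu=knl), the new scalar moves are selected for plain bitcasts and scalar stores:

; CHECK: vmovdz %xmm0, %eax
define i32 @test1(float %x) {
  %res = bitcast float %x to i32
  ret i32 %res
}

; CHECK: vmovsdz %xmm0, (%rdi)
define void @test6(double %x, double* %y) {
  store double %x, double* %y, align 8
  ret void
}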

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188637 91177308-0d34-0410-b5e6-96231b3b80d8
Elena Demikhovsky 2013-08-18 13:08:57 +00:00
parent ff79bc6e18
commit 3491d67d3a
7 changed files with 410 additions and 44 deletions

lib/Target/X86/X86ISelLowering.cpp

@@ -1434,8 +1434,6 @@ void X86TargetLowering::resetOperationActions() {
if (!VT.is512BitVector())
continue;
setOperationAction(ISD::LOAD, VT, Promote);
AddPromotedToType (ISD::LOAD, VT, MVT::v8i64);
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
}

lib/Target/X86/X86InstrAVX512.td

@@ -555,7 +555,7 @@ let Constraints = "$src1 = $dst" in {
(bitconvert (mem_frag addr:$src3)))))]>, EVEX_4V;
}
}
defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv8i64, i512mem,
defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem,
v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
@@ -1107,7 +1107,7 @@ def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
}
multiclass avx512_mov_int<bits<8> opc, string asm, RegisterClass RC,
RegisterClass KRC, PatFrag bc_frag,
RegisterClass KRC,
PatFrag ld_frag, X86MemOperand x86memop> {
let neverHasSideEffects = 1 in
def rr : AVX512XSI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
@@ -1116,7 +1116,7 @@ let neverHasSideEffects = 1 in
let canFoldAsLoad = 1 in
def rm : AVX512XSI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (bc_frag (ld_frag addr:$src)))]>,
[(set RC:$dst, (ld_frag addr:$src))]>,
EVEX;
let Constraints = "$src1 = $dst" in {
def rrk : AVX512XSI<opc, MRMSrcReg, (outs RC:$dst),
@@ -1132,10 +1132,10 @@ let Constraints = "$src1 = $dst" in {
}
}
defm VMOVDQU32 : avx512_mov_int<0x6F, "vmovdqu32", VR512, VK16WM, bc_v16i32,
memopv8i64, i512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, bc_v8i64,
memopv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQU32 : avx512_mov_int<0x6F, "vmovdqu32", VR512, VK16WM, memopv16i32, i512mem>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, memopv8i64, i512mem>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
let AddedComplexity = 20 in {
def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1),
@@ -1151,4 +1151,267 @@ def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1),
(v8i64 VR512:$src2))),
(VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
}
// Move Int Doubleword to Packed Double Int
//
def VMOVDI2PDIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
"vmovd{z}\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
EVEX, VEX_LIG;
def VMOVDI2PDIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
"vmovd{z}\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
def VMOV64toPQIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG;
def VMOV64toSDZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
def VMOVSDto64Zrr : AVX512SI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
def VMOVSDto64Zmr : AVX512SI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
EVEX_CD8<64, CD8VT1>;
// Move Int Doubleword to Single Scalar
//
def VMOVDI2SSZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
"vmovd{z}\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_LIG;
def VMOVDI2SSZrm : AVX512SI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
"vmovd{z}\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
// Move Packed Doubleword Int to Packed Double Int
//
def VMOVPDI2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
"vmovd{z}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128X:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
EVEX, VEX_LIG;
def VMOVPDI2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd{z}\t{$src, $dst|$dst, $src}",
[(store (i32 (vector_extract (v4i32 VR128X:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
// Move Packed Doubleword Int first element to Doubleword Int
//
def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>, TB, OpSize, EVEX, VEX_LIG, VEX_W,
Requires<[HasAVX512, In64BitMode]>;
def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs),
(ins i64mem:$dst, VR128X:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
addr:$dst)], IIC_SSE_MOVDQ>,
EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>,
Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
// Move Scalar Single to Double Int
//
def VMOVSS2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst),
(ins FR32X:$src),
"vmovd{z}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))],
IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG;
def VMOVSS2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
"vmovd{z}\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
// Move Quadword Int to Packed Quadword Int
//
def VMOVQI2PQIZrm : AVX512SI<0x7E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar <string asm, RegisterClass RC,
SDNode OpNode, ValueType vt,
X86MemOperand x86memop, PatFrag mem_pat> {
def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128X:$dst, (vt (OpNode VR128X:$src1,
(scalar_to_vector RC:$src2))))],
IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
EVEX, VEX_LIG;
def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
EVEX, VEX_LIG;
}
let ExeDomain = SSEPackedSingle in
defm VMOVSSZ : avx512_move_scalar<"movss{z}", FR32X, X86Movss, v4f32, f32mem,
loadf32>, XS, EVEX_CD8<32, CD8VT1>;
let ExeDomain = SSEPackedDouble in
defm VMOVSDZ : avx512_move_scalar<"movsd{z}", FR64X, X86Movsd, v2f64, f64mem,
loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
// For the disassembler
let isCodeGenOnly = 1 in {
def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, FR32X:$src2),
"movss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_MOV_S_RR>,
XS, EVEX_4V, VEX_LIG;
def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, FR64X:$src2),
"movsd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_MOV_S_RR>,
XD, EVEX_4V, VEX_LIG, VEX_W;
}
let Predicates = [HasAVX512] in {
let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128X)>;
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128X)>;
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>;
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
}
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVSDZrm addr:$src), sub_xmm)>;
// Extract and store.
def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>;
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
(VMOVSSZrr (v4i32 VR128X:$src1),
(COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
(VMOVSSZrr (v4f32 VR128X:$src1),
(COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
// 256-bit variants
def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
(EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
sub_xmm)>;
def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
(EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
sub_xmm)>;
// Shuffle with VMOVSD
def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
(EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
sub_xmm)>;
def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
(EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
sub_xmm)>;
def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
}
let AddedComplexity = 15 in
def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v2i64 (X86vzmovl
(v2i64 VR128X:$src))))],
IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
let AddedComplexity = 20 in
def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),
"vmovq{z}\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v2i64 (X86vzmovl
(loadv2i64 addr:$src))))],
IIC_SSE_MOVDQ>, EVEX, VEX_W,
EVEX_CD8<8, CD8VT8>;
let AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(VMOVZPQILo2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
}

lib/Target/X86/X86InstrFormats.td

@@ -339,10 +339,11 @@ def __xd : XD;
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
!if(hasVEXPrefix /* VEX */, [UseAVX],
!if(!eq(Prefix, __xs.Prefix), [UseSSE1],
!if(!eq(Prefix, __xd.Prefix), [UseSSE2],
!if(hasOpSizePrefix, [UseSSE2], [UseSSE1]))));
!if(hasOpSizePrefix, [UseSSE2], [UseSSE1])))));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -352,8 +353,9 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
!if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
!if(hasVEXPrefix /* VEX */, [UseAVX],
!if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -363,8 +365,9 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
InstrItinClass itin, Domain d>
: I<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
!if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
!if(hasVEXPrefix /* VEX */, [HasAVX],
!if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -381,11 +384,12 @@ class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern
class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin, Domain d>
: Ii8<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
!if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
!if(hasVEXPrefix /* VEX */, [HasAVX],
!if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
}
// SSE1 Instruction Templates:
@@ -460,7 +464,7 @@ class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
Requires<[HasAVX]>;
Requires<[UseAVX]>;
class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
@@ -472,7 +476,7 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB,
OpSize, Requires<[HasAVX]>;
OpSize, Requires<[UseAVX]>;
class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, TB,

lib/Target/X86/X86InstrFragmentsSIMD.td

@@ -348,6 +348,8 @@ def alignedloadv4i64 : PatFrag<(ops node:$ptr),
// 512-bit aligned load pattern fragments
def alignedloadv16f32 : PatFrag<(ops node:$ptr),
(v16f32 (alignedload512 node:$ptr))>;
def alignedloadv16i32 : PatFrag<(ops node:$ptr),
(v16i32 (alignedload512 node:$ptr))>;
def alignedloadv8f64 : PatFrag<(ops node:$ptr),
(v8f64 (alignedload512 node:$ptr))>;
def alignedloadv8i64 : PatFrag<(ops node:$ptr),
@@ -382,6 +384,7 @@ def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
// 512-bit memop pattern fragments
def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>;
def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>;
def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>;
def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>;
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
@@ -437,7 +440,6 @@ def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
def vzmovl_v2i64 : PatFrag<(ops node:$src),
(bitconvert (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector (loadi64 node:$src))))))>;

lib/Target/X86/X86InstrInfo.cpp

@@ -2910,17 +2910,20 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
// SrcReg(GR64) -> DestReg(VR64)
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
if (X86::GR64RegClass.contains(DestReg)) {
if (X86::VR128RegClass.contains(SrcReg))
if (X86::VR128XRegClass.contains(SrcReg))
// Copy from a VR128 register to a GR64 register.
return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
X86::MOVPQIto64rr);
if (X86::VR64RegClass.contains(SrcReg))
// Copy from a VR64 register to a GR64 register.
return X86::MOVSDto64rr;
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
if (X86::VR128RegClass.contains(DestReg))
return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
if (X86::VR128XRegClass.contains(DestReg))
return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
X86::MOV64toPQIrr);
// Copy from a GR64 register to a VR64 register.
if (X86::VR64RegClass.contains(DestReg))
return X86::MOV64toSDrr;
@@ -2929,13 +2932,13 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
// SrcReg(FR32) -> DestReg(GR32)
// SrcReg(GR32) -> DestReg(FR32)
if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
// Copy from a FR32 register to a GR32 register.
return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
// Copy from a GR32 register to a FR32 register.
return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
return 0;
}
@@ -3040,6 +3043,21 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
bool isStackAligned,
const TargetMachine &TM,
bool load) {
if (TM.getSubtarget<X86Subtarget>().hasAVX512()) {
if (X86::VK8RegClass.hasSubClassEq(RC) ||
X86::VK16RegClass.hasSubClassEq(RC))
return load ? X86::KMOVWkm : X86::KMOVWmk;
if (X86::FR32XRegClass.hasSubClassEq(RC))
return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
if (X86::FR64XRegClass.hasSubClassEq(RC))
return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
if (X86::VR128XRegClass.hasSubClassEq(RC) ||
X86::VR256XRegClass.hasSubClassEq(RC) ||
X86::VR512RegClass.hasSubClassEq(RC))
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
switch (RC->getSize()) {
default:
@@ -3099,6 +3117,12 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
else
return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
case 64:
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
if (isStackAligned)
return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
}
@@ -3125,7 +3149,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const MachineFunction &MF = *MBB.getParent();
assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
"Stack slot too small for store");
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
@@ -3141,7 +3165,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
@@ -3161,7 +3185,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
@@ -3175,7 +3199,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);

lib/Target/X86/X86InstrSSE.td

@@ -4484,8 +4484,8 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let Predicates = [HasAVX] in
def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
let Predicates = [UseAVX] in
def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
VEX, Sched<[WriteLoad]>;
@@ -4577,7 +4577,7 @@ def MOVZDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
IIC_SSE_MOVDQ>;
} // AddedComplexity, SchedRW
let Predicates = [HasAVX] in {
let Predicates = [UseAVX] in {
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -4630,7 +4630,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
VEX, Requires<[HasAVX]>;
VEX, Requires<[UseAVX]>;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4674,7 +4674,7 @@ def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
(loadi64 addr:$src))))))],
IIC_SSE_MOVDQ>,
XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>;
XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
let AddedComplexity = 20 in
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
@@ -4685,7 +4685,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
IIC_SSE_MOVDQ>,
XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
let Predicates = [HasAVX], AddedComplexity = 20 in {
let Predicates = [UseAVX], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(VMOVZQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
@@ -4719,7 +4719,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
IIC_SSE_MOVQ_RR>,
XS, VEX, Requires<[HasAVX]>;
XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -4735,7 +4735,7 @@ def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
[(set VR128:$dst, (v2i64 (X86vzmovl
(loadv2i64 addr:$src))))],
IIC_SSE_MOVDQ>,
XS, VEX, Requires<[HasAVX]>;
XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -4747,7 +4747,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
} // SchedRW
let AddedComplexity = 20 in {
let Predicates = [HasAVX] in {
let Predicates = [UseAVX] in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(VMOVZPQILo2PQIrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
@@ -4771,7 +4771,7 @@ def VMOVQd64rr : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
IIC_SSE_MOVDQ>, VEX, VEX_W;
// Recognize "movd" with GR64 destination, but encode as a "movq"
def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}", [],
"movq\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVDQ>, VEX, VEX_W;
} // SchedRW
@@ -4780,7 +4780,7 @@ def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
// xm = mem64
let SchedRW = [WriteMove] in {
let Predicates = [HasAVX] in
let Predicates = [UseAVX] in
def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),

test/CodeGen/X86/avx512-mov.ll (new file)

@@ -0,0 +1,75 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
; CHECK-LABEL: @test1
; CHECK: vmovdz %xmm0, %eax
; CHECK: ret
define i32 @test1(float %x) {
%res = bitcast float %x to i32
ret i32 %res
}
; CHECK-LABEL: @test2
; CHECK: vmovdz %edi
; CHECK: ret
define <4 x i32> @test2(i32 %x) {
%res = insertelement <4 x i32>undef, i32 %x, i32 0
ret <4 x i32>%res
}
; CHECK-LABEL: @test3
; CHECK: vmovqz %rdi
; CHECK: ret
define <2 x i64> @test3(i64 %x) {
%res = insertelement <2 x i64>undef, i64 %x, i32 0
ret <2 x i64>%res
}
; CHECK-LABEL: @test4
; CHECK: vmovdz (%rdi)
; CHECK: ret
define <4 x i32> @test4(i32* %x) {
%y = load i32* %x
%res = insertelement <4 x i32>undef, i32 %y, i32 0
ret <4 x i32>%res
}
; CHECK-LABEL: @test5
; CHECK: vmovssz %xmm0, (%rdi)
; CHECK: ret
define void @test5(float %x, float* %y) {
store float %x, float* %y, align 4
ret void
}
; CHECK-LABEL: @test6
; CHECK: vmovsdz %xmm0, (%rdi)
; CHECK: ret
define void @test6(double %x, double* %y) {
store double %x, double* %y, align 8
ret void
}
; CHECK-LABEL: @test7
; CHECK: vmovssz (%rdi), %xmm0
; CHECK: ret
define float @test7(i32* %x) {
%y = load i32* %x
%res = bitcast i32 %y to float
ret float %res
}
; CHECK-LABEL: @test8
; CHECK: vmovdz %xmm0, %eax
; CHECK: ret
define i32 @test8(<4 x i32> %x) {
%res = extractelement <4 x i32> %x, i32 0
ret i32 %res
}
; CHECK-LABEL: @test9
; CHECK: vmovqz %xmm0, %rax
; CHECK: ret
define i64 @test9(<2 x i64> %x) {
%res = extractelement <2 x i64> %x, i32 0
ret i64 %res
}