diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 312c8db16d9..4e71f368e6a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1434,8 +1434,6 @@ void X86TargetLowering::resetOperationActions() { if (!VT.is512BitVector()) continue; - setOperationAction(ISD::LOAD, VT, Promote); - AddPromotedToType (ISD::LOAD, VT, MVT::v8i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index d100e88454f..760b4ed120c 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -555,7 +555,7 @@ let Constraints = "$src1 = $dst" in { (bitconvert (mem_frag addr:$src3)))))]>, EVEX_4V; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv8i64, i512mem, +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -1107,7 +1107,7 @@ def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst), } multiclass avx512_mov_int opc, string asm, RegisterClass RC, - RegisterClass KRC, PatFrag bc_frag, + RegisterClass KRC, PatFrag ld_frag, X86MemOperand x86memop> { let neverHasSideEffects = 1 in def rr : AVX512XSI, + [(set RC:$dst, (ld_frag addr:$src))]>, EVEX; let Constraints = "$src1 = $dst" in { def rrk : AVX512XSI, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, bc_v8i64, - memopv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VMOVDQU32 : avx512_mov_int<0x6F, "vmovdqu32", VR512, VK16WM, memopv16i32, i512mem>, + EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VMOVDQU64 : avx512_mov_int<0x6F, "vmovdqu64", VR512, VK8WM, memopv8i64, i512mem>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let AddedComplexity = 20 in { def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1), @@ -1151,4 +1151,267 @@ def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1), (v8i64 VR512:$src2))), (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>; } +// Move Int Doubleword to Packed Double Int +// +def VMOVDI2PDIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + EVEX, VEX_LIG; +def VMOVDI2PDIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; +def VMOV64toPQIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG; +def VMOV64toSDZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +def VMOVSDto64Zrr : AVX512SI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +def VMOVSDto64Zmr : AVX512SI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(store (i64 
(bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, + EVEX_CD8<64, CD8VT1>; + +// Move Int Doubleword to Single Scalar +// +def VMOVDI2SSZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_LIG; + +def VMOVDI2SSZrm : AVX512SI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + +// Move Packed Doubleword Int to Packed Double Int +// +def VMOVPDI2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, + EVEX, VEX_LIG; +def VMOVPDI2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128X:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(store (i32 (vector_extract (v4i32 VR128X:$src), + (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + +// Move Packed Doubleword Int first element to Doubleword Int +// +def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>, TB, OpSize, EVEX, VEX_LIG, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>, + Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; + +// Move Scalar Single to Double Int +// +def VMOVSS2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), + (ins FR32X:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32X:$src))], + IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG; +def VMOVSS2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, FR32X:$src), + "vmovd{z}\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + +// Move Quadword Int to Packed Quadword Int +// +def VMOVQI2PQIZrm : AVX512SI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, + EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + +//===----------------------------------------------------------------------===// +// AVX-512 MOVSS, MOVSD +//===----------------------------------------------------------------------===// + +multiclass avx512_move_scalar { + def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128X:$dst, (vt (OpNode VR128X:$src1, + (scalar_to_vector RC:$src2))))], + IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; + def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, + EVEX, VEX_LIG; + def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + EVEX, VEX_LIG; +} + +let ExeDomain = SSEPackedSingle in +defm VMOVSSZ : avx512_move_scalar<"movss{z}", 
FR32X, X86Movss, v4f32, f32mem, + loadf32>, XS, EVEX_CD8<32, CD8VT1>; + +let ExeDomain = SSEPackedDouble in +defm VMOVSDZ : avx512_move_scalar<"movsd{z}", FR64X, X86Movsd, v2f64, f64mem, + loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; + + +// For the disassembler +let isCodeGenOnly = 1 in { + def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR32X:$src2), + "movss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_MOV_S_RR>, + XS, EVEX_4V, VEX_LIG; + def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR64X:$src2), + "movsd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_MOV_S_RR>, + XD, EVEX_4V, VEX_LIG, VEX_W; +} + +let Predicates = [HasAVX512] in { + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128X)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128X)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128X)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128X)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; + } + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVSDZrm addr:$src), sub_xmm)>; + + // Extract and store. 
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; + def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>; + + // Shuffle with VMOVSS + def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)), + (VMOVSSZrr (v4i32 VR128X:$src1), + (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>; + def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)), + (VMOVSSZrr (v4f32 VR128X:$src1), + (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>; + + // 256-bit variants + def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)), + sub_xmm)>; + + // Shuffle with VMOVSD + def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)), + sub_xmm)>; + + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; +} + +let AddedComplexity = 15 in +def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (v2i64 VR128X:$src))))], + IIC_SSE_MOVQ_RR>, EVEX, VEX_W; + +let AddedComplexity = 20 in +def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovq{z}\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, + EVEX_CD8<8, CD8VT8>; +let AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVZPQILo2PQIZrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))), + (VMOVZPQILo2PQIZrr VR128X:$src)>; +} diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 64018b3b57d..b50706c360f 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ 
-339,10 +339,11 @@ def __xd : XD; class SI o, Format F, dag outs, dag ins, string asm, list pattern, InstrItinClass itin = NoItinerary> : I { - let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512], + !if(hasVEXPrefix /* VEX */, [UseAVX], !if(!eq(Prefix, __xs.Prefix), [UseSSE1], !if(!eq(Prefix, __xd.Prefix), [UseSSE2], - !if(hasOpSizePrefix, [UseSSE2], [UseSSE1])))); + !if(hasOpSizePrefix, [UseSSE2], [UseSSE1]))))); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -352,8 +353,9 @@ class SI o, Format F, dag outs, dag ins, string asm, class SIi8 o, Format F, dag outs, dag ins, string asm, list pattern, InstrItinClass itin = NoItinerary> : Ii8 { - let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])); + let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512], + !if(hasVEXPrefix /* VEX */, [UseAVX], + !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]))); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -363,8 +365,9 @@ class SIi8 o, Format F, dag outs, dag ins, string asm, class PI o, Format F, dag outs, dag ins, string asm, list pattern, InstrItinClass itin, Domain d> : I { - let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], - !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); + let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512], + !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]))); // AVX instructions have a 'v' prefix in the mnemonic let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); @@ -381,11 +384,12 @@ class MMXPI o, Format F, dag outs, dag ins, string asm, list patter class PIi8 o, Format F, dag outs, dag ins, string asm, list pattern, InstrItinClass itin, Domain d> : Ii8 { - let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX], - !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])); + let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512], + !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]))); // AVX instructions have a 'v' prefix in the mnemonic - let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm); + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); } // SSE1 Instruction Templates: @@ -460,7 +464,7 @@ class PDIi8 o, Format F, dag outs, dag ins, string asm, class VSDI o, Format F, dag outs, dag ins, string asm, list pattern, InstrItinClass itin = NoItinerary> : I, XD, - Requires<[HasAVX]>; + Requires<[UseAVX]>; class VS2SI o, Format F, dag outs, dag ins, string asm, list pattern, InstrItinClass itin = NoItinerary> : I, XS, @@ -472,7 +476,7 @@ class VPDI o, Format F, dag outs, dag ins, string asm, class VS2I o, Format F, dag outs, dag ins, string asm, list pattern, InstrItinClass itin = NoItinerary> : I, TB, - OpSize, Requires<[HasAVX]>; + OpSize, Requires<[UseAVX]>; class S2I o, Format F, dag outs, dag ins, string asm, list pattern, InstrItinClass itin = NoItinerary> : I, TB, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index fe35393fc58..e6460e972bc 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -348,6 +348,8 @@ def alignedloadv4i64 : PatFrag<(ops node:$ptr), // 512-bit aligned load pattern fragments def alignedloadv16f32 : PatFrag<(ops node:$ptr), (v16f32 
(alignedload512 node:$ptr))>; +def alignedloadv16i32 : PatFrag<(ops node:$ptr), + (v16i32 (alignedload512 node:$ptr))>; def alignedloadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (alignedload512 node:$ptr))>; def alignedloadv8i64 : PatFrag<(ops node:$ptr), @@ -382,6 +384,7 @@ def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; // 512-bit memop pattern fragments def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>; def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>; +def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>; def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a @@ -437,7 +440,6 @@ def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>; def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; - def vzmovl_v2i64 : PatFrag<(ops node:$src), (bitconvert (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index b7737685ffc..71df2bb6c8f 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2910,17 +2910,20 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, // SrcReg(GR64) -> DestReg(VR64) bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); if (X86::GR64RegClass.contains(DestReg)) { - if (X86::VR128RegClass.contains(SrcReg)) + if (X86::VR128XRegClass.contains(SrcReg)) // Copy from a VR128 register to a GR64 register. - return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr; + return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr : + X86::MOVPQIto64rr); if (X86::VR64RegClass.contains(SrcReg)) // Copy from a VR64 register to a GR64 register. return X86::MOVSDto64rr; } else if (X86::GR64RegClass.contains(SrcReg)) { // Copy from a GR64 register to a VR128 register. - if (X86::VR128RegClass.contains(DestReg)) - return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr; + if (X86::VR128XRegClass.contains(DestReg)) + return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr : + X86::MOV64toPQIrr); // Copy from a GR64 register to a VR64 register. if (X86::VR64RegClass.contains(DestReg)) return X86::MOV64toSDrr; @@ -2929,13 +2932,13 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, // SrcReg(FR32) -> DestReg(GR32) // SrcReg(GR32) -> DestReg(FR32) - if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg)) + if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg)) // Copy from a FR32 register to a GR32 register. - return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr; + return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr); - if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) // Copy from a GR32 register to a FR32 register. - return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr; + return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? 
X86::VMOVDI2SSrr : X86::MOVDI2SSrr); return 0; } @@ -3040,6 +3043,21 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, bool isStackAligned, const TargetMachine &TM, bool load) { + if (TM.getSubtarget().hasAVX512()) { + if (X86::VK8RegClass.hasSubClassEq(RC) || + X86::VK16RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVWkm : X86::KMOVWmk; + + if (X86::FR32XRegClass.hasSubClassEq(RC)) + return load ? X86::VMOVSSZrm : X86::VMOVSSZmr; + if (X86::FR64XRegClass.hasSubClassEq(RC)) + return load ? X86::VMOVSDZrm : X86::VMOVSDZmr; + if (X86::VR128XRegClass.hasSubClassEq(RC) || + X86::VR256XRegClass.hasSubClassEq(RC) || + X86::VR512RegClass.hasSubClassEq(RC)) + return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; + } + bool HasAVX = TM.getSubtarget().hasAVX(); switch (RC->getSize()) { default: @@ -3099,6 +3117,12 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; else return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; + case 64: + assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); + if (isStackAligned) + return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; + else + return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } } @@ -3125,7 +3149,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const MachineFunction &MF = *MBB.getParent(); assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); @@ -3141,7 +3165,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl &NewMIs) const { - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); @@ -3161,7 +3185,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); @@ -3175,7 +3199,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl &NewMIs) const { - unsigned Alignment = RC->getSize() == 32 ? 
32 : 16; + unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a86006a7f62..aa057dba227 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4484,8 +4484,8 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), //===---------------------------------------------------------------------===// // Bitcast FR64 <-> GR64 // -let Predicates = [HasAVX] in -def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), +let Predicates = [UseAVX] in +def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, VEX, Sched<[WriteLoad]>; @@ -4577,7 +4577,7 @@ def MOVZDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), IIC_SSE_MOVDQ>; } // AddedComplexity, SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. let AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), @@ -4630,7 +4630,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, - VEX, Requires<[HasAVX]>; + VEX, Requires<[UseAVX]>; def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -4674,7 +4674,7 @@ def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, - XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>; + XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>; let AddedComplexity = 20 in def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), @@ -4685,7 +4685,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; -let Predicates = [HasAVX], AddedComplexity = 20 in { +let Predicates = [UseAVX], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), (VMOVZQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), @@ -4719,7 +4719,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], IIC_SSE_MOVQ_RR>, - XS, VEX, Requires<[HasAVX]>; + XS, VEX, Requires<[UseAVX]>; let AddedComplexity = 15 in def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", @@ -4735,7 +4735,7 @@ def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (loadv2i64 addr:$src))))], IIC_SSE_MOVDQ>, - XS, VEX, Requires<[HasAVX]>; + XS, VEX, Requires<[UseAVX]>; let AddedComplexity = 20 in { def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movq\t{$src, $dst|$dst, $src}", @@ -4747,7 +4747,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), } // SchedRW let AddedComplexity = 20 in { - let Predicates = [HasAVX] in { + let Predicates = [UseAVX] in { def : Pat<(v2i64 (X86vzmovl (loadv2i64 
addr:$src))),
            (VMOVZPQILo2PQIrm addr:$src)>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
@@ -4771,7 +4771,7 @@ def VMOVQd64rr : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                       IIC_SSE_MOVDQ>, VEX, VEX_W;
 // Recognize "movd" with GR64 destination, but encode as a "movq"
 def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                          "movd\t{$src, $dst|$dst, $src}", [],
+                          "movq\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVDQ>, VEX, VEX_W;
 } // SchedRW
 
@@ -4780,7 +4780,7 @@ def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
 // xm = mem64
 
 let SchedRW = [WriteMove] in {
-let Predicates = [HasAVX] in
+let Predicates = [UseAVX] in
 def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
 def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
new file mode 100644
index 00000000000..3070862b857
--- /dev/null
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: @test1
+; CHECK: vmovdz %xmm0, %eax
+; CHECK: ret
+define i32 @test1(float %x) {
+  %res = bitcast float %x to i32
+  ret i32 %res
+}
+
+; CHECK-LABEL: @test2
+; CHECK: vmovdz %edi
+; CHECK: ret
+define <4 x i32> @test2(i32 %x) {
+  %res = insertelement <4 x i32>undef, i32 %x, i32 0
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test3
+; CHECK: vmovqz %rdi
+; CHECK: ret
+define <2 x i64> @test3(i64 %x) {
+  %res = insertelement <2 x i64>undef, i64 %x, i32 0
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: @test4
+; CHECK: vmovdz (%rdi)
+; CHECK: ret
+define <4 x i32> @test4(i32* %x) {
+  %y = load i32* %x
+  %res = insertelement <4 x i32>undef, i32 %y, i32 0
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test5
+; CHECK: vmovssz %xmm0, (%rdi)
+; CHECK: ret
+define void @test5(float %x, float* %y) {
+  store float %x, float* %y, align 4
+  ret void
+}
+
+; CHECK-LABEL: @test6
+; CHECK: vmovsdz %xmm0, (%rdi)
+; CHECK: ret
+define void @test6(double %x, double* %y) {
+  store double %x, double* %y, align 8
+  ret void
+}
+
+; CHECK-LABEL: @test7
+; CHECK: vmovssz (%rdi), %xmm0
+; CHECK: ret
+define float @test7(i32* %x) {
+  %y = load i32* %x
+  %res = bitcast i32 %y to float
+  ret float %res
+}
+
+; CHECK-LABEL: @test8
+; CHECK: vmovdz %xmm0, %eax
+; CHECK: ret
+define i32 @test8(<4 x i32> %x) {
+  %res = extractelement <4 x i32> %x, i32 0
+  ret i32 %res
+}
+
+; CHECK-LABEL: @test9
+; CHECK: vmovqz %xmm0, %rax
+; CHECK: ret
+define i64 @test9(<2 x i64> %x) {
+  %res = extractelement <2 x i64> %x, i32 0
+  ret i64 %res
+}
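
An illustrative sketch, not part of the patch or of avx512-mov.ll: the new VMOVSDto64Zmr definition above carries the pattern [(store (i64 (bitconvert FR64:$src)), addr:$dst)], i.e. a double whose raw bits are stored as an i64, a shape the included tests do not exercise. IR of roughly the form below produces that DAG; whether ISel ultimately selects VMOVSDto64Zmr or an equivalent scalar store such as VMOVSDZmr depends on earlier DAG combines, so this only sketches the matched shape, and the function name is arbitrary.

; Hypothetical example, assuming -mcpu=knl as in the RUN line above.
; The bitcast+store below produces the (store (i64 (bitconvert f64))) DAG
; that the VMOVSDto64Zmr pattern is written to match; its mnemonic uses the
; same "{z}" suffix convention as the instructions tested above.
define void @store_double_bits(double %x, i64* %p) {
  %bits = bitcast double %x to i64   ; reinterpret the f64 value as i64
  store i64 %bits, i64* %p, align 8  ; store the 64 raw bits to memory
  ret void
}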