From e3809eed34f000581a464689596eefde2a6d1f24 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Wed, 24 Jul 2013 11:02:47 +0000
Subject: [PATCH] I'm starting to commit the KNL backend. I'll push patches
 one-by-one. This patch includes support for the extended register set
 XMM16-31, YMM16-31, ZMM0-31. The full ISA is described here:
 http://software.intel.com/en-us/intel-isa-extensions

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187030 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86.td              | 19 +++++++
 lib/Target/X86/X86CallingConv.td   | 40 +++++++++++++--
 lib/Target/X86/X86ISelLowering.cpp | 15 +++++-
 lib/Target/X86/X86RegisterInfo.cpp | 28 ++++++++++
 lib/Target/X86/X86RegisterInfo.h   |  3 ++
 lib/Target/X86/X86RegisterInfo.td  | 82 +++++++++++++++++++++++-------
 lib/Target/X86/X86Subtarget.h      | 15 +++++-
 lib/Target/X86/X86VZeroUpper.cpp   | 15 ++++--
 8 files changed, 190 insertions(+), 27 deletions(-)

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index c865500deb3..fbf531d8dff 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -86,6 +86,16 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
 def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
                                    "Enable AVX2 instructions",
                                    [FeatureAVX]>;
+def FeatureAVX512 : SubtargetFeature<"avx-512", "X86SSELevel", "AVX512",
+                                     "Enable AVX-512 instructions",
+                                     [FeatureAVX2]>;
+def FeatureERI : SubtargetFeature<"avx-512-eri", "HasERI", "true",
+                                  "Enable AVX-512 Exponential and Reciprocal Instructions">;
+def FeatureCDI : SubtargetFeature<"avx-512-cdi", "HasCDI", "true",
+                                  "Enable AVX-512 Conflict Detection Instructions">;
+def FeaturePFI : SubtargetFeature<"avx-512-pfi", "HasPFI", "true",
+                                  "Enable AVX-512 PreFetch Instructions">;
+
 def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
                          "Enable packed carry-less multiplication instructions",
                          [FeatureSSE2]>;
@@ -227,6 +237,15 @@ def : ProcessorModel<"core-avx2", HaswellModel,
                       FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
                       FeatureHLE]>;
+// KNL
+// FIXME: define KNL model
+def : ProcessorModel<"knl", HaswellModel,
+                     [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI,
+                      FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
+                      FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+                      FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
+                      FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>;
+
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
 def : Proc<"k6-3",            [Feature3DNow]>;
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 40c5d91b605..38e25910b2c 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -49,6 +49,12 @@ def RetCC_X86Common : CallingConv<[
   CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
             CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+  // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+  // can only be used by ABI non-compliant code. This vector type is only
+  // supported while using the AVX-512 target feature.
+  CCIfType<[v16i32, v8i64, v16f32, v8f64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
   // MMX vector types are always returned in MM0. If the target doesn't have
   // MM0, it doesn't support these vector types.
   CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
@@ -99,6 +105,10 @@ def RetCC_Intel_OCL_BI : CallingConv<[
   CCIfType<[v8f32, v4f64, v8i32, v4i64],
             CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+  // 512-bit FP vectors
+  CCIfType<[v16f32, v8f64, v16i32, v8i64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
   // i32, i64 in the standard way
   CCDelegateTo
 ]>;
@@ -213,10 +223,15 @@ def CC_X86_64_C : CallingConv<[
   // fixed arguments to vararg functions are supposed to be passed in
   // registers. Actually modeling that would be a lot of work, though.
   CCIfNotVarArg>>>,
+  // The first 8 512-bit vector arguments are passed in ZMM registers.
+  CCIfNotVarArg>>>,
+
   // Integer/FP values get stored in stack slots that are 8 bytes in size and
   // 8-byte aligned if there are no more registers to hold them.
   CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
@@ -230,7 +245,11 @@ def CC_X86_64_C : CallingConv<[
   // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
   CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToStack<32, 32>>
+           CCAssignToStack<32, 32>>,
+
+  // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v16i32, v8i64, v16f32, v8f64],
+           CCAssignToStack<64, 64>>
 ]>;

 // Calling convention used on Win64
@@ -251,6 +270,9 @@ def CC_X86_Win64_C : CallingConv<[
   // 256 bit vectors are passed by pointer
   CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect>,
+  // 512 bit vectors are passed by pointer
+  CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect>,
+
   // The first 4 MMX vector arguments are passed in GPRs.
   CCIfType<[x86mmx], CCBitConvertToType>,
@@ -345,7 +367,7 @@ def CC_X86_32_Common : CallingConv<[
   // The first 4 AVX 256-bit vector arguments are passed in YMM registers.
   CCIfNotVarArg>>>,

   // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
@@ -469,6 +491,10 @@ def CC_Intel_OCL_BI : CallingConv<[
   CCIfType<[v8f32, v4f64, v8i32, v4i64],
            CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+  // The 512-bit vector arguments are passed in ZMM registers.
+  CCIfType<[v16f32, v8f64, v16i32, v8i64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo>,
   CCIfSubtarget<"is64Bit()", CCDelegateTo>,
   CCDelegateTo
@@ -535,6 +561,10 @@ def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
                                    R12, R13, R14, R15,
                                    (sequence "YMM%u", 6, 15))>;
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+                                                     R12, R13, R14, R15,
+                                                     (sequence "ZMM%u", 6, 21),
+                                                     K4, K5, K6, K7)>;
 //Standard C + XMM 8-15
 def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
                                            (sequence "XMM%u", 8, 15))>;
@@ -542,3 +572,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
 //Standard C + YMM 8-15
 def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
                                                (sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add CSR_64,
+                                                  (sequence "ZMM%u", 16, 31),
+                                                  K4, K5, K6, K7)>;
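As a rough illustration of what the calling-convention entries above mean for user code, here is a minimal sketch. It is not part of the patch: the function name, the use of <immintrin.h>, and the build assumptions are illustrative only. On a 64-bit SysV target with AVX-512 enabled, a non-vararg 512-bit vector argument like the one below would be assigned to a ZMM register by the new CC_X86_64_C rules, and the 512-bit return value to ZMM0 per RetCC_X86Common; once the ZMM argument registers are exhausted, such values fall back to 64-byte stack slots with 64-byte alignment.

// Illustrative sketch only -- not code from the patch. Assumes a 64-bit
// SysV target compiled with AVX-512 enabled and the AVX-512F intrinsics
// available through <immintrin.h>.
#include <immintrin.h>

// __m512 corresponds to the v16f32 type handled above: the first eight such
// non-vararg arguments are passed in ZMM registers, later ones on the stack
// in 64-byte slots with 64-byte alignment.
__m512 scale(__m512 v, float f) {
  // The 512-bit return value is returned in ZMM0.
  return _mm512_mul_ps(v, _mm512_set1_ps(f));
}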
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8f8024356f5..e75781e6ba0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -18595,6 +18595,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
     case MVT::v8f32:
     case MVT::v4f64:
       return std::make_pair(0U, &X86::VR256RegClass);
+    case MVT::v8f64:
+    case MVT::v16f32:
+    case MVT::v16i32:
+    case MVT::v8i64:
+      return std::make_pair(0U, &X86::VR512RegClass);
     }
     break;
   }
@@ -18705,7 +18710,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       }
     } else if (Res.second == &X86::FR32RegClass ||
                Res.second == &X86::FR64RegClass ||
-               Res.second == &X86::VR128RegClass) {
+               Res.second == &X86::VR128RegClass ||
+               Res.second == &X86::VR256RegClass ||
+               Res.second == &X86::FR32XRegClass ||
+               Res.second == &X86::FR64XRegClass ||
+               Res.second == &X86::VR128XRegClass ||
+               Res.second == &X86::VR256XRegClass ||
+               Res.second == &X86::VR512RegClass) {
       // Handle references to XMM physical registers that got mapped into the
       // wrong class.  This can happen with constraints like {xmm0} where the
       // target independent register mapper will just pick the first match it can
@@ -18719,6 +18730,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       Res.second = &X86::VR128RegClass;
     else if (X86::VR256RegClass.hasType(VT))
       Res.second = &X86::VR256RegClass;
+    else if (X86::VR512RegClass.hasType(VT))
+      Res.second = &X86::VR512RegClass;
   }

   return Res;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index eacae2c83b6..d22db105389 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -241,6 +241,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   case CallingConv::Intel_OCL_BI: {
     bool HasAVX = TM.getSubtarget().hasAVX();
+    bool HasAVX512 = TM.getSubtarget().hasAVX512();
+    if (HasAVX512 && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+    if (HasAVX512 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX512_SaveList;
     if (HasAVX && IsWin64)
       return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
     if (HasAVX && Is64Bit)
@@ -275,8 +280,13 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 const uint32_t*
 X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
   bool HasAVX = TM.getSubtarget().hasAVX();
+  bool HasAVX512 = TM.getSubtarget().hasAVX512();

   if (CC == CallingConv::Intel_OCL_BI) {
+    if (IsWin64 && HasAVX512)
+      return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+    if (Is64Bit && HasAVX512)
+      return CSR_64_Intel_OCL_BI_AVX512_RegMask;
     if (IsWin64 && HasAVX)
       return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
     if (Is64Bit && HasAVX)
@@ -380,6 +390,12 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
         Reserved.set(*AI);
     }
   }
+  if (!Is64Bit || !TM.getSubtarget().hasAVX512()) {
+    for (unsigned n = 16; n != 32; ++n) {
+      for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
   return Reserved;
 }
@@ -690,4 +706,16 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
     }
   }
 }
+
+unsigned get512BitSuperRegister(unsigned Reg) {
+  if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
+    return X86::ZMM0 + (Reg - X86::XMM0);
+  if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
+    return X86::ZMM0 + (Reg - X86::YMM0);
+  if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31)
+    return Reg;
+  llvm_unreachable("Unexpected SIMD register");
+  return 0;
+}
+
 }
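The new get512BitSuperRegister helper maps any XMM/YMM/ZMM physical register to the 512-bit register that contains it. A minimal usage sketch follows; it is not part of the patch, the wrapper function is hypothetical, and it assumes the usual X86 backend build context where "X86RegisterInfo.h" and the generated register enums are available.

// Hypothetical helper for an X86 machine pass -- a sketch, not code from
// this commit. Only valid for XMM/YMM/ZMM physical registers, since
// get512BitSuperRegister asserts on anything else.
#include "X86RegisterInfo.h"

static bool aliasSame512BitReg(unsigned RegA, unsigned RegB) {
  // Two vector registers overlap exactly when they share the same 512-bit
  // super-register, e.g. XMM5 and YMM5 both map to ZMM5.
  return llvm::get512BitSuperRegister(RegA) ==
         llvm::get512BitSuperRegister(RegB);
}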
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 6a1b3282721..fb1768214e9 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -137,6 +137,9 @@ public:
 // e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
 unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
+//get512BitSuperRegister - X86 utility - returns 512-bit super register
+unsigned get512BitSuperRegister(unsigned Reg);
+
 } // End llvm namespace
 #endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index fbbb2575f65..b8027283cc1 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -26,6 +26,7 @@ let Namespace = "X86" in {
   def sub_16bit : SubRegIndex<16>;
   def sub_32bit : SubRegIndex<32>;
   def sub_xmm   : SubRegIndex<128>;
+  def sub_ymm   : SubRegIndex<256>;
 }

 //===----------------------------------------------------------------------===//
@@ -186,28 +187,53 @@ def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
 def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
 def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
 def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
+def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
+def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
+def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
+def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
+def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
+def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
+def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
+def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
+def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
+def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
+def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
+def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
+def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
+def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
+def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+
 } // CostPerUse

-// YMM Registers, used by AVX instructions
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
 let SubRegIndices = [sub_xmm] in {
-def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias;
-def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias;
-def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias;
-def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias;
-def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias;
-def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias;
-def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias;
-def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias;
-def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias;
-def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias;
-def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias;
-def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias;
-def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias;
-def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias;
-def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias;
-def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias;
+  foreach Index = 0-31 in {
+    def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast("XMM"#Index)]>,
+                    DwarfRegAlias("XMM"#Index)>;
+  }
 }
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+  foreach Index = 0-31 in {
+    def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast("YMM"#Index)]>,
+                    DwarfRegAlias("XMM"#Index)>;
+  }
+}
+
+ // Mask Registers, used by AVX-512 instructions.
+ def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
+ def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
+ def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
+ def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
+ def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
+ def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
+ def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
+ def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
+
 class STRegister Enc, list A> : X86Reg {
   let Aliases = A;
 }
@@ -421,3 +447,25 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
   let CopyCost = -1;  // Don't allow copying of status registers.
   let isAllocatable = 0;
 }
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512,
+                          (sequence "ZMM%u", 0, 31)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                           128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                           256, (sequence "YMM%u", 0, 31)>;
+
+def VK8 : RegisterClass<"X86", [v8i1], 8, (sequence "K%u", 0, 7)>;
+def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)>;
+
+def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)>;
+def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>;
+
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 01a28d0fd1d..87932388703 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -42,7 +42,7 @@ enum Style {
 class X86Subtarget : public X86GenSubtargetInfo {
 protected:
   enum X86SSEEnum {
-    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512
   };

   enum X863DNowEnum {
@@ -169,6 +169,15 @@ protected:
   /// address generation (AG) time.
   bool LEAUsesAG;

+  /// Processor has AVX-512 PreFetch Instructions
+  bool HasPFI;
+
+  /// Processor has AVX-512 Exponential and Reciprocal Instructions
+  bool HasERI;
+
+  /// Processor has AVX-512 Conflict Detection Instructions
+  bool HasCDI;
+
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
@@ -249,6 +258,7 @@ public:
   bool hasSSE42() const { return X86SSELevel >= SSE42; }
   bool hasAVX() const { return X86SSELevel >= AVX; }
   bool hasAVX2() const { return X86SSELevel >= AVX2; }
+  bool hasAVX512() const { return X86SSELevel >= AVX512; }
   bool hasFp256() const { return hasAVX(); }
   bool hasInt256() const { return hasAVX2(); }
   bool hasSSE4A() const { return HasSSE4A; }
@@ -282,6 +292,9 @@ public:
   bool padShortFunctions() const { return PadShortFunctions; }
   bool callRegIndirect() const { return CallRegIndirect; }
   bool LEAusesAG() const { return LEAUsesAG; }
+  bool hasCDI() const { return HasCDI; }
+  bool hasPFI() const { return HasPFI; }
+  bool hasERI() const { return HasERI; }

   bool isAtom() const { return X86ProcFamily == IntelAtom; }
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 0f77948c0ef..477f75afef2 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -105,23 +105,28 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
 }

 static bool isYmmReg(unsigned Reg) {
-  if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
-    return true;
+  return (Reg >= X86::YMM0 && Reg <= X86::YMM31);
+}

-  return false;
+static bool isZmmReg(unsigned Reg) {
+  return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31);
 }

 static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
   for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
        E = MRI.livein_end(); I != E; ++I)
     if (isYmmReg(I->first) || isZmmReg(I->first))
       return true;

   return false;
 }

 static bool clobbersAllYmmRegs(const MachineOperand &MO) {
-  for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
+  for (unsigned reg = X86::YMM0; reg < X86::YMM31; ++reg) {
+    if (!MO.clobbersPhysReg(reg))
+      return false;
+  }
+  for (unsigned reg = X86::ZMM0; reg < X86::ZMM31; ++reg) {
     if (!MO.clobbersPhysReg(reg))
       return false;
   }
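Finally, a minimal sketch of how follow-up KNL patches would be expected to consult the new subtarget queries before taking AVX-512-specific paths. This is not from the commit; the helper is hypothetical and assumes the X86 backend build context. hasCDI(), hasERI() and hasPFI() would gate the conflict-detection, exponential/reciprocal and prefetch instruction subsets in the same way.

// Illustrative sketch only -- not code from this commit.
#include "X86Subtarget.h"

static bool canUseExtendedVectorRegs(const llvm::X86Subtarget &ST) {
  // Mirrors the check added to X86RegisterInfo::getReservedRegs above:
  // XMM16-31, YMM16-31 and ZMM0-31 are only usable in 64-bit mode when the
  // AVX-512 feature is enabled.
  return ST.is64Bit() && ST.hasAVX512();
}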