Mirror of https://github.com/c64scene-ar/llvm-6502.git, synced 2025-07-18 10:24:45 +00:00
Add a feature flag for slow 32-byte unaligned memory accesses [x86].
This patch adds a feature flag to avoid unaligned 32-byte load/store AVX codegen for Sandy Bridge and Ivy Bridge. There is no functionality change intended for those chips. Previously, the absence of AVX2 was being used as a proxy to detect this feature. But that hindered codegen for AVX-enabled AMD chips such as btver2 that do not have the 32-byte unaligned access slowdown.

Performance measurements are included in PR21541 (http://llvm.org/bugs/show_bug.cgi?id=21541).

Differential Revision: http://reviews.llvm.org/D6355

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222544 91177308-0d34-0410-b5e6-96231b3b80d8
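Since "slow-unaligned-mem-32" is an ordinary subtarget feature string, it can also be flipped by hand with llc's -mattr option, independent of the -mcpu choice. A minimal sketch, not part of this commit (the function name and CHECK line are illustrative; the feature string and CPU names are the ones defined in the patch):

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 -mattr=+slow-unaligned-mem-32 | FileCheck %s
; Forcing the flag on btver2 should split the unaligned 32-byte load the same
; way Sandy Bridge does: two 16-byte loads plus a vinsertf128.
; CHECK: vinsertf128
define <8 x float> @forced_split(<8 x float>* %p) {
  %v = load <8 x float>* %p, align 16
  ret <8 x float> %v
}

Conversely, -mattr=-slow-unaligned-mem-32 on corei7-avx should bring back the single unaligned 32-byte vmovups.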
@@ -82,6 +82,9 @@ def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
 def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
                                         "IsUAMemFast", "true",
                                         "Fast unaligned memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+                                          "IsUAMem32Slow", "true",
+                                          "Slow unaligned 32-byte memory access">;
 def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
                                     "Support SSE 4a instructions",
                                     [FeatureSSE3]>;
@@ -271,12 +274,14 @@ def : ProcessorModel<"westmere", SandyBridgeModel,
 // rather than a superset.
 def : ProcessorModel<"corei7-avx", SandyBridgeModel,
                      [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+                      FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES,
+                      FeaturePCLMUL]>;
 // Ivy Bridge
 def : ProcessorModel<"core-avx-i", SandyBridgeModel,
                      [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
-                      FeatureF16C, FeatureFSGSBase]>;
+                      FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES,
+                      FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+                      FeatureFSGSBase]>;

 // Haswell
 def : ProcessorModel<"core-avx2", HaswellModel,
@@ -24376,11 +24376,12 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   SDLoc dl(Ld);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();

-  // On Sandybridge unaligned 256bit loads are inefficient.
+  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+  // into two 16-byte operations.
   ISD::LoadExtType Ext = Ld->getExtensionType();
   unsigned Alignment = Ld->getAlignment();
   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
     unsigned NumElems = RegVT.getVectorNumElements();
     if (NumElems < 2)
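The !IsAligned check above means the split only applies when the 32-byte access is not known to be aligned. A small illustrative IR case, not part of this patch (hypothetical function name):

define <8 x float> @load32bytes_aligned(<8 x float>* %Ap) {
  %A = load <8 x float>* %Ap, align 32
  ret <8 x float> %A
}

With -mcpu=corei7-avx this should remain a single 32-byte vmovaps load rather than being broken into two 16-byte operations.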
@@ -24423,13 +24424,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();

-  // If we are saving a concatenation of two XMM registers, perform two stores.
-  // On Sandy Bridge, 256-bit memory operations are executed by two
-  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
-  // memory operation.
+  // If we are saving a concatenation of two XMM registers and 32-byte stores
+  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
   unsigned Alignment = St->getAlignment();
   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
-  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
       StVT == VT && !IsAligned) {
     unsigned NumElems = VT.getVectorNumElements();
     if (NumElems < 2)
@@ -264,6 +264,7 @@ void X86Subtarget::initializeEnvironment() {
   IsBTMemSlow = false;
   IsSHLDSlow = false;
   IsUAMemFast = false;
+  IsUAMem32Slow = false;
   HasVectorUAMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
@@ -159,6 +159,9 @@ protected:
   /// IsUAMemFast - True if unaligned memory access is fast.
   bool IsUAMemFast;

+  /// True if unaligned 32-byte memory accesses are slow.
+  bool IsUAMem32Slow;
+
   /// HasVectorUAMem - True if SIMD operations can have unaligned memory
   /// operands. This may require setting a feature bit in the processor.
   bool HasVectorUAMem;
@@ -374,6 +377,7 @@ public:
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+  bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
   bool hasVectorUAMem() const { return HasVectorUAMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
@@ -1,13 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
-
-define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
-entry:
-  ; CHECK: vmovaps
-  ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
-  ; CHECK: vmovups
-  %A = load <4 x i32>* %Ap
-  %B = load <4 x i32>* %Bp
-  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i32> %Z, <8 x i32>* %P, align 16
-  ret void
-}
test/CodeGen/X86/unaligned-32-byte-memops.ll (new file, 46 lines)
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
+; because that is slower than two 16-byte loads.
+; Other AVX-capable chips don't have that problem.
+
+define <8 x float> @load32bytes(<8 x float>* %Ap) {
+  ; CHECK-LABEL: load32bytes
+
+  ; SANDYB: vmovaps
+  ; SANDYB: vinsertf128
+  ; SANDYB: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL: retq
+
+  %A = load <8 x float>* %Ap, align 16
+  ret <8 x float> %A
+}
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
+; because that is slower than two 16-byte stores.
+; Other AVX-capable chips don't have that problem.
+
+define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
+  ; CHECK-LABEL: store32bytes
+
+  ; SANDYB: vextractf128
+  ; SANDYB: vmovaps
+  ; SANDYB: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL: retq
+
+  store <8 x float> %A, <8 x float>* %P, align 16
+  ret void
+}
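Note that corei7-avx and core-avx-i share the SANDYB check prefix because both processor models now carry FeatureSlowUAMem32, while btver2 and core-avx2 keep the single unaligned 32-byte vmovups. Assuming a standard LLVM build tree, the new file can be run on its own with something like bin/llvm-lit test/CodeGen/X86/unaligned-32-byte-memops.ll.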