diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index ab3319afe93..30b3b2876b8 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -132,9 +132,9 @@ def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
 def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
                                   "Enable XOP instructions",
                                   [FeatureFMA4]>;
-def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
-                                          "HasVectorUAMem", "true",
-          "Allow unaligned memory operands on vector/SIMD instructions">;
+def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
+                                              "HasSSEUnalignedMem", "true",
+              "Allow unaligned memory operands with SSE instructions">;
 def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
                                   "Enable AES instructions",
                                   [FeatureSSE2]>;
@@ -309,7 +309,6 @@ class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
[The bodies of this and the next several hunks were garbled in the source; each removes
 FeatureVectorUAMem from one of the processor definitions up through KnightsLandingProc.
 Only the tails of the last two hunks survive:]
+                                      FeatureSlowIncDec]>;
 def : KnightsLandingProc<"knl">;
 
 // FIXME: define SKX model
@@ -399,7 +395,7 @@ class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [
+                                     FeatureSlowIncDec, FeatureSGX]>;
 def : SkylakeProc<"skylake">;
 def : SkylakeProc<"skx">; // Legacy alias.
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 85768aeb4e4..e5de404ed21 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -433,7 +433,7 @@ def alignedloadv8i64 : PatFrag<(ops node:$ptr),
 // setting a feature bit in the processor (on startup, for example).
 // Opteron 10h and later implement such a feature.
 def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return Subtarget->hasVectorUAMem()
+  return Subtarget->hasSSEUnalignedMem()
       || cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;
 
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 01889e887c2..e90da0f98a6 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -265,7 +265,7 @@ void X86Subtarget::initializeEnvironment() {
   IsSHLDSlow = false;
   IsUAMemFast = false;
   IsUAMem32Slow = false;
-  HasVectorUAMem = false;
+  HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
   HasSlowDivide32 = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 27dec659643..417f1332c54 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -162,9 +162,9 @@ protected:
   /// True if unaligned 32-byte memory accesses are slow.
   bool IsUAMem32Slow;
 
-  /// HasVectorUAMem - True if SIMD operations can have unaligned memory
-  /// operands. This may require setting a feature bit in the processor.
-  bool HasVectorUAMem;
+  /// True if SSE operations can have unaligned memory operands.
+  /// This may require setting a configuration bit in the processor.
+  bool HasSSEUnalignedMem;
 
   /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
   /// this is true for most x86-64 chips, but not the first AMD chips.
@@ -375,7 +375,7 @@ public:
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
   bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
-  bool hasVectorUAMem() const { return HasVectorUAMem; }
+  bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
diff --git a/test/CodeGen/X86/fold-vex.ll b/test/CodeGen/X86/fold-vex.ll
index a0c5e22b1c0..5a8b1d8cbfd 100644
--- a/test/CodeGen/X86/fold-vex.ll
+++ b/test/CodeGen/X86/fold-vex.ll
@@ -1,12 +1,18 @@
 ; Use CPU parameters to ensure that a CPU-specific attribute is not overriding the AVX definition.
 
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=-avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -mattr=-avx | FileCheck %s --check-prefix=SSE
 
 ; No need to load unaligned operand from memory using an explicit instruction with AVX.
 ; The operand should be folded into the AND instr.
 
+; With SSE, folding memory operands into math/logic ops requires 16-byte alignment
+; unless specially configured on some CPUs such as AMD Family 10H.
+
 define <4 x i32> @test1(<4 x i32>* %p0, <4 x i32> %in1) nounwind {
   %in0 = load <4 x i32>* %p0, align 2
   %a = and <4 x i32> %in0, %in1
@@ -16,5 +22,10 @@ define <4 x i32> @test1(<4 x i32>* %p0, <4 x i32> %in1) nounwind {
 
 ; CHECK-NOT: vmovups
 ; CHECK: vandps (%rdi), %xmm0, %xmm0
 ; CHECK-NEXT: ret
+
+; SSE-LABEL: @test1
+; SSE: movups (%rdi), %xmm1
+; SSE-NEXT: andps %xmm1, %xmm0
+; SSE-NEXT: ret
 }
diff --git a/test/CodeGen/X86/2010-01-07-UAMemFeature.ll b/test/CodeGen/X86/sse-unaligned-mem-feature.ll
similarity index 74%
rename from test/CodeGen/X86/2010-01-07-UAMemFeature.ll
rename to test/CodeGen/X86/sse-unaligned-mem-feature.ll
index bb24adb4181..15f91ee04ea 100644
--- a/test/CodeGen/X86/2010-01-07-UAMemFeature.ll
+++ b/test/CodeGen/X86/sse-unaligned-mem-feature.ll
@@ -1,5 +1,4 @@
-; RUN: llc -mcpu=yonah -mattr=vector-unaligned-mem -march=x86 < %s | FileCheck %s
-; CHECK: addps (
+; RUN: llc -mcpu=yonah -mattr=sse-unaligned-mem -march=x86 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -8,4 +7,7 @@ define <4 x float> @foo(<4 x float>* %P, <4 x float> %In) nounwind {
 %A = load <4 x float>* %P, align 4
 %B = fadd <4 x float> %A, %In
 ret <4 x float> %B
+
+; CHECK-LABEL: @foo
+; CHECK: addps (
 }
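
Note (not part of the patch): a minimal sketch, in the style of the renamed test above, of how the new attribute spelling is expected to behave. The function name and the UAMEM/STRICT check prefixes are illustrative only; the RUN flags mirror the tests in the patch. With -mattr=sse-unaligned-mem a 4-byte-aligned SSE operand should fold directly into the addps, while without it the value should first be loaded with an unaligned move.

; RUN: llc -mcpu=yonah -mattr=sse-unaligned-mem -march=x86 < %s | FileCheck %s --check-prefix=UAMEM
; RUN: llc -mcpu=yonah -march=x86 < %s | FileCheck %s --check-prefix=STRICT

define <4 x float> @add_mem(<4 x float>* %P, <4 x float> %In) nounwind {
  %A = load <4 x float>* %P, align 4
  %B = fadd <4 x float> %A, %In
  ret <4 x float> %B

; With the feature, the under-aligned operand folds into the add:
; UAMEM-LABEL: @add_mem
; UAMEM: addps (

; Without it, the operand is loaded with an unaligned move first:
; STRICT-LABEL: @add_mem
; STRICT: movups
; STRICT: addps %xmm
}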