llvm-6502/test/CodeGen/X86/unaligned-32-byte-memops.ll

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL

; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
; because that is slower than two 16-byte loads. 
; Other AVX-capable chips don't have that problem.

define <8 x float> @load32bytes(<8 x float>* %Ap) {
  ; Every RUN line enables exactly one check prefix, so the function label
  ; must be spelled once per prefix; a label written with the default prefix
  ; would be skipped by all four FileCheck invocations.
  ; SANDYB-LABEL: load32bytes
  ; BTVER2-LABEL: load32bytes
  ; HASWELL-LABEL: load32bytes

  ; Sandy Bridge / Ivy Bridge: the 32-byte load is expected to be split into
  ; a 16-byte load followed by a 128-bit insert.
  ; SANDYB: vmovaps
  ; SANDYB: vinsertf128
  ; SANDYB: retq

  ; btver2: a single unaligned 32-byte load is expected.
  ; BTVER2: vmovups
  ; BTVER2: retq

  ; Haswell: a single unaligned 32-byte load is expected.
  ; HASWELL: vmovups
  ; HASWELL: retq

  ; The pointer is only 16-byte aligned, so the 32-byte access is unaligned
  ; relative to its own size.
  %A = load <8 x float>* %Ap, align 16
  ret <8 x float> %A
}

; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
; because that is slower than two 16-byte stores.
; Other AVX-capable chips don't have that problem.

define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
  ; Every RUN line enables exactly one check prefix, so the function label
  ; must be spelled once per prefix; a label written with the default prefix
  ; would be skipped by all four FileCheck invocations.
  ; SANDYB-LABEL: store32bytes
  ; BTVER2-LABEL: store32bytes
  ; HASWELL-LABEL: store32bytes

  ; Sandy Bridge / Ivy Bridge: the 32-byte store is expected to be split into
  ; a 128-bit extract plus 16-byte stores.
  ; SANDYB: vextractf128
  ; SANDYB: vmovaps
  ; SANDYB: retq

  ; btver2: a single unaligned 32-byte store is expected.
  ; BTVER2: vmovups
  ; BTVER2: retq

  ; Haswell: a single unaligned 32-byte store is expected.
  ; HASWELL: vmovups
  ; HASWELL: retq

  ; The pointer is only 16-byte aligned, so the 32-byte access is unaligned
  ; relative to its own size.
  store <8 x float> %A, <8 x float>* %P, align 16
  ret void
}
Add a feature flag for slow 32-byte unaligned memory accesses [x86]. This patch adds a feature flag to avoid unaligned 32-byte load/store AVX codegen for Sandy Bridge and Ivy Bridge. There is no functionality change intended for those chips. Previously, the absence of AVX2 was being used as a proxy to detect this feature. But that hindered codegen for AVX-enabled AMD chips such as btver2 that do not have the 32-byte unaligned access slowdown. Performance measurements are included in PR21541 ( http://llvm.org/bugs/show_bug.cgi?id=21541 ). Differential Revision: http://reviews.llvm.org/D6355 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222544 91177308-0d34-0410-b5e6-96231b3b80d8 2014-11-21 17:40:04 +00:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx \| FileCheck %s --check-prefix=SANDYB`
			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i \| FileCheck %s --check-prefix=SANDYB`
			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 \| FileCheck %s --check-prefix=BTVER2`
			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 \| FileCheck %s --check-prefix=HASWELL`

			`; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load`
			`; because that is slower than two 16-byte loads.`
			`; Other AVX-capable chips don't have that problem.`

			`define <8 x float> @load32bytes(<8 x float>* %Ap) {`
			`; CHECK-LABEL: load32bytes`

			`; SANDYB: vmovaps`
			`; SANDYB: vinsertf128`
			`; SANDYB: retq`

			`; BTVER2: vmovups`
			`; BTVER2: retq`

			`; HASWELL: vmovups`
			`; HASWELL: retq`

			`%A = load <8 x float>* %Ap, align 16`
			`ret <8 x float> %A`
			`}`

			`; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store`
			`; because that is slower than two 16-byte stores.`
			`; Other AVX-capable chips don't have that problem.`

			`define void @store32bytes(<8 x float> %A, <8 x float>* %P) {`
			`; CHECK-LABEL: store32bytes`

			`; SANDYB: vextractf128`
			`; SANDYB: vmovaps`
			`; SANDYB: retq`

			`; BTVER2: vmovups`
			`; BTVER2: retq`

			`; HASWELL: vmovups`
			`; HASWELL: retq`

			`store <8 x float> %A, <8 x float>* %P, align 16`
			`ret void`
			`}`