; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck %s --check-prefix=AVX2
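; Each RUN line exercises the cost model for a different subtarget. The default
; check prefix is verified with -mcpu=corei7-avx (AVX1), the SSE3 prefix with
; -mcpu=core2, and the AVX2 prefix with -mcpu=core-avx2. Lines tagged with the
; AVX prefix document expected AVX1 costs, but no RUN line above selects that
; prefix.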
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

define i32 @add(i32 %arg) {
;CHECK: cost of 1 {{.*}} add
%A = add <4 x i32> undef, undef
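; The wider adds below are more expensive because 256-bit integer arithmetic
; only arrives with AVX2; a <8 x i32> or <4 x i64> add presumably has to be
; split into 128-bit halves under AVX1, which is what the larger costs reflect.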
;CHECK: cost of 4 {{.*}} add
%B = add <8 x i32> undef, undef
;CHECK: cost of 1 {{.*}} add
%C = add <2 x i64> undef, undef
;CHECK: cost of 4 {{.*}} add
%D = add <4 x i64> undef, undef
;CHECK: cost of 8 {{.*}} add
%E = add <8 x i64> undef, undef
;CHECK: cost of 0 {{.*}} ret
ret i32 undef
}

define i32 @xor(i32 %arg) {
;CHECK: cost of 1 {{.*}} xor
%A = xor <4 x i32> undef, undef
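; The 256-bit xors below also cost 1, since bitwise ops map onto the 256-bit
; floating-point-domain logic instructions (e.g. VXORPS) that AVX1 already
; provides.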
;CHECK: cost of 1 {{.*}} xor
%B = xor <8 x i32> undef, undef
;CHECK: cost of 1 {{.*}} xor
%C = xor <2 x i64> undef, undef
;CHECK: cost of 1 {{.*}} xor
%D = xor <4 x i64> undef, undef
;CHECK: cost of 0 {{.*}} ret
ret i32 undef
}

; CHECK: mul
define void @mul() {
; A <2 x i32> gets expanded to a <2 x i64> vector.
; A <2 x i64> vector multiply is implemented using
; 3 PMULUDQ and 2 PADDS and 4 shifts.
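; That works out to roughly 3 + 2 + 4 = 9, matching the cost checked below
; (and twice that, 18, for the <4 x i64> multiply).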
;CHECK: cost of 9 {{.*}} mul
%A0 = mul <2 x i32> undef, undef
;CHECK: cost of 9 {{.*}} mul
%A1 = mul <2 x i64> undef, undef
;CHECK: cost of 18 {{.*}} mul
%A2 = mul <4 x i64> undef, undef
ret void
}

; SSE3: sse3mull
define void @sse3mull() {
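; core2 lacks PMULLD (an SSE4.1 instruction), so the <4 x i32> multiply is
; presumably expanded with PMULUDQ and shuffles, hence the higher cost here.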
; SSE3: cost of 6 {{.*}} mul
%A0 = mul <4 x i32> undef, undef
ret void
; SSE3: avx2mull
}

; AVX2: avx2mull
define void @avx2mull() {
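; Even with AVX2 there is no packed 64-bit multiply instruction (VPMULLQ only
; appears later with AVX-512DQ), so the <4 x i64> multiply is presumably still
; expanded from VPMULUDQ, shifts and adds, which keeps the cost at 9.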
; AVX2: cost of 9 {{.*}} mul
%A0 = mul <4 x i64> undef, undef
ret void
; AVX2: fmul
}

; CHECK: fmul
define i32 @fmul(i32 %arg) {
;CHECK: cost of 2 {{.*}} fmul
%A = fmul <4 x float> undef, undef
;CHECK: cost of 2 {{.*}} fmul
%B = fmul <8 x float> undef, undef
ret i32 undef
}

; AVX: shift
; AVX2: shift
define void @shift() {
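; AVX2 adds per-element variable shifts (VPSLLVD/Q, VPSRLVD/Q and VPSRAVD),
; which is presumably why the AVX2 run expects cost 1 where plain AVX expects
; 2. There is no packed arithmetic right shift of 64-bit elements (VPSRAVQ
; needs AVX-512), so the ashr of <2 x i64> below is far more expensive.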
; AVX: cost of 2 {{.*}} shl
; AVX2: cost of 1 {{.*}} shl
%A0 = shl <4 x i32> undef, undef
; AVX: cost of 2 {{.*}} shl
; AVX2: cost of 1 {{.*}} shl
%A1 = shl <2 x i64> undef, undef

; AVX: cost of 2 {{.*}} lshr
; AVX2: cost of 1 {{.*}} lshr
%B0 = lshr <4 x i32> undef, undef
; AVX: cost of 2 {{.*}} lshr
; AVX2: cost of 1 {{.*}} lshr
%B1 = lshr <2 x i64> undef, undef

; AVX: cost of 2 {{.*}} ashr
; AVX2: cost of 1 {{.*}} ashr
%C0 = ashr <4 x i32> undef, undef
; AVX: cost of 6 {{.*}} ashr
; AVX2: cost of 20 {{.*}} ashr
%C1 = ashr <2 x i64> undef, undef

ret void
}

; AVX: avx2shift
; AVX2: avx2shift
define void @avx2shift() {
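; Same pattern for the 256-bit vectors. The variable-shift instructions in
; AVX2 operate on 256-bit registers directly, while under plain AVX the
; operation presumably has to be split, and the 64-bit ashr again has no
; matching instruction at all.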
; AVX: cost of 2 {{.*}} shl
; AVX2: cost of 1 {{.*}} shl
%A0 = shl <8 x i32> undef, undef
; AVX: cost of 2 {{.*}} shl
; AVX2: cost of 1 {{.*}} shl
%A1 = shl <4 x i64> undef, undef

; AVX: cost of 2 {{.*}} lshr
; AVX2: cost of 1 {{.*}} lshr
%B0 = lshr <8 x i32> undef, undef
; AVX: cost of 2 {{.*}} lshr
; AVX2: cost of 1 {{.*}} lshr
%B1 = lshr <4 x i64> undef, undef

; AVX: cost of 2 {{.*}} ashr
; AVX2: cost of 1 {{.*}} ashr
%C0 = ashr <8 x i32> undef, undef
; AVX: cost of 12 {{.*}} ashr
; AVX2: cost of 40 {{.*}} ashr
%C1 = ashr <4 x i64> undef, undef

ret void
}