; Mirror of https://github.com/c64scene-ar/llvm-6502.git
; (synced 2025-11-24 08:18:33 +00:00; 530 lines, 15 KiB, LLVM).
;
; This patch ensures that SHL/SRL/SRA shifts for i8 and i16 vectors avoid
; scalarization. It builds on the existing i8 SHL vectorized implementation of
; moving the shift bits up to the sign bit position and separating the 4, 2 & 1
; bit shifts, with several improvements:
;
;   1 - SSE41 targets can use (v)pblendvb directly with the sign bit instead of
;       performing a comparison to feed into a VSELECT node.
;   2 - Pre-SSE41 targets were masking + comparing with a 0x80 constant - we
;       avoid this by using the fact that a set sign bit means a negative
;       integer, which can be compared against zero to then feed into VSELECT,
;       avoiding the need for a constant mask (zero generation is much cheaper).
;   3 - SRA i8 needs to be unpacked to the upper byte of an i16 so that the i16
;       psraw instruction can be correctly used for sign extension - we have to
;       do more work than for SHL/SRL, but perf tests indicate that this is
;       still beneficial.
;
; The i16 implementation is similar to but simpler than the i8 one - we have to
; do 8, 4, 2 & 1 bit shifts, but less shift masking is involved. SSE41 use of
; (v)pblendvb requires that the i16 shift amount is splatted to both bytes,
; however.
;
; Tested on SSE2, SSE41 and AVX machines.
;
; Differential Revision: http://reviews.llvm.org/D9474
; git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239509 91177308-0d34-0410-b5e6-96231b3b80d8
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s
; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s

; lshr of <2 x i16> by the per-lane variable amount %b.
; Expected: cost-model cost of 20; llc output contains psrlq.
%shifttype = type <2 x i16>
define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
entry:
  ; SSE2: shift2i16
  ; SSE2: cost of 20 {{.*}} lshr
  ; SSE2-CODEGEN: shift2i16
  ; SSE2-CODEGEN: psrlq

  %0 = lshr %shifttype %a, %b
  ret %shifttype %0
}

; lshr of <4 x i16> by the per-lane variable amount %b.
; Expected: cost-model cost of 40; llc output contains a scalar "shrl %cl".
%shifttype4i16 = type <4 x i16>
define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
entry:
  ; SSE2: shift4i16
  ; SSE2: cost of 40 {{.*}} lshr
  ; SSE2-CODEGEN: shift4i16
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype4i16 %a, %b
  ret %shifttype4i16 %0
}

; lshr of <8 x i16> by the per-lane variable amount %b.
; Expected: cost-model cost of 32; llc output contains psrlw.
%shifttype8i16 = type <8 x i16>
define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
entry:
  ; SSE2: shift8i16
  ; SSE2: cost of 32 {{.*}} lshr
  ; SSE2-CODEGEN: shift8i16
  ; SSE2-CODEGEN: psrlw

  %0 = lshr %shifttype8i16 %a, %b
  ret %shifttype8i16 %0
}

; lshr of <16 x i16> by the per-lane variable amount %b.
; Expected: cost-model cost of 64; llc output contains psrlw.
%shifttype16i16 = type <16 x i16>
define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
entry:
  ; SSE2: shift16i16
  ; SSE2: cost of 64 {{.*}} lshr
  ; SSE2-CODEGEN: shift16i16
  ; SSE2-CODEGEN: psrlw

  %0 = lshr %shifttype16i16 %a, %b
  ret %shifttype16i16 %0
}

; lshr of <32 x i16> by the per-lane variable amount %b.
; Expected: cost-model cost of 128; llc output contains psrlw.
%shifttype32i16 = type <32 x i16>
define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
entry:
  ; SSE2: shift32i16
  ; SSE2: cost of 128 {{.*}} lshr
  ; SSE2-CODEGEN: shift32i16
  ; SSE2-CODEGEN: psrlw

  %0 = lshr %shifttype32i16 %a, %b
  ret %shifttype32i16 %0
}

; lshr of <2 x i32> by the per-lane variable amount %b.
; Expected: cost-model cost of 20; llc output contains psrlq.
%shifttype2i32 = type <2 x i32>
define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
entry:
  ; SSE2: shift2i32
  ; SSE2: cost of 20 {{.*}} lshr
  ; SSE2-CODEGEN: shift2i32
  ; SSE2-CODEGEN: psrlq

  %0 = lshr %shifttype2i32 %a, %b
  ret %shifttype2i32 %0
}

; lshr of <4 x i32> by the per-lane variable amount %b.
; Expected: cost-model cost of 40; llc output contains a scalar "shrl %cl".
%shifttype4i32 = type <4 x i32>
define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
entry:
  ; SSE2: shift4i32
  ; SSE2: cost of 40 {{.*}} lshr
  ; SSE2-CODEGEN: shift4i32
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype4i32 %a, %b
  ret %shifttype4i32 %0
}

; lshr of <8 x i32> by the per-lane variable amount %b.
; Expected: cost-model cost of 80; llc output contains a scalar "shrl %cl".
%shifttype8i32 = type <8 x i32>
define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
entry:
  ; SSE2: shift8i32
  ; SSE2: cost of 80 {{.*}} lshr
  ; SSE2-CODEGEN: shift8i32
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype8i32 %a, %b
  ret %shifttype8i32 %0
}

; lshr of <16 x i32> by the per-lane variable amount %b.
; Expected: cost-model cost of 160; llc output contains a scalar "shrl %cl".
%shifttype16i32 = type <16 x i32>
define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
entry:
  ; SSE2: shift16i32
  ; SSE2: cost of 160 {{.*}} lshr
  ; SSE2-CODEGEN: shift16i32
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype16i32 %a, %b
  ret %shifttype16i32 %0
}

; lshr of <32 x i32> by the per-lane variable amount %b.
; Expected: cost-model cost of 320; llc output contains a scalar "shrl %cl".
%shifttype32i32 = type <32 x i32>
define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
entry:
  ; SSE2: shift32i32
  ; SSE2: cost of 320 {{.*}} lshr
  ; SSE2-CODEGEN: shift32i32
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype32i32 %a, %b
  ret %shifttype32i32 %0
}

; lshr of <2 x i64> by the per-lane variable amount %b.
; Expected: cost-model cost of 20; llc output contains psrlq.
%shifttype2i64 = type <2 x i64>
define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
entry:
  ; SSE2: shift2i64
  ; SSE2: cost of 20 {{.*}} lshr
  ; SSE2-CODEGEN: shift2i64
  ; SSE2-CODEGEN: psrlq

  %0 = lshr %shifttype2i64 %a, %b
  ret %shifttype2i64 %0
}

; lshr of <4 x i64> by the per-lane variable amount %b.
; Expected: cost-model cost of 40; llc output contains psrlq.
%shifttype4i64 = type <4 x i64>
define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
entry:
  ; SSE2: shift4i64
  ; SSE2: cost of 40 {{.*}} lshr
  ; SSE2-CODEGEN: shift4i64
  ; SSE2-CODEGEN: psrlq

  %0 = lshr %shifttype4i64 %a, %b
  ret %shifttype4i64 %0
}

; lshr of <8 x i64> by the per-lane variable amount %b.
; Expected: cost-model cost of 80; llc output contains psrlq.
%shifttype8i64 = type <8 x i64>
define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
entry:
  ; SSE2: shift8i64
  ; SSE2: cost of 80 {{.*}} lshr
  ; SSE2-CODEGEN: shift8i64
  ; SSE2-CODEGEN: psrlq

  %0 = lshr %shifttype8i64 %a, %b
  ret %shifttype8i64 %0
}

; lshr of <16 x i64> by the per-lane variable amount %b.
; Expected: cost-model cost of 160; llc output contains psrlq.
%shifttype16i64 = type <16 x i64>
define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
entry:
  ; SSE2: shift16i64
  ; SSE2: cost of 160 {{.*}} lshr
  ; SSE2-CODEGEN: shift16i64
  ; SSE2-CODEGEN: psrlq

  %0 = lshr %shifttype16i64 %a, %b
  ret %shifttype16i64 %0
}

; lshr of <32 x i64> by the per-lane variable amount %b.
; Expected: cost-model cost of 320; llc output contains psrlq.
%shifttype32i64 = type <32 x i64>
define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
entry:
  ; SSE2: shift32i64
  ; SSE2: cost of 320 {{.*}} lshr
  ; SSE2-CODEGEN: shift32i64
  ; SSE2-CODEGEN: psrlq

  %0 = lshr %shifttype32i64 %a, %b
  ret %shifttype32i64 %0
}

; lshr of <2 x i8> by the per-lane variable amount %b.
; Expected: cost-model cost of 20; llc output contains psrlq.
%shifttype2i8 = type <2 x i8>
define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
entry:
  ; SSE2: shift2i8
  ; SSE2: cost of 20 {{.*}} lshr
  ; SSE2-CODEGEN: shift2i8
  ; SSE2-CODEGEN: psrlq

  %0 = lshr %shifttype2i8 %a, %b
  ret %shifttype2i8 %0
}

; lshr of <4 x i8> by the per-lane variable amount %b.
; Expected: cost-model cost of 40; llc output contains a scalar "shrl %cl".
%shifttype4i8 = type <4 x i8>
define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
entry:
  ; SSE2: shift4i8
  ; SSE2: cost of 40 {{.*}} lshr
  ; SSE2-CODEGEN: shift4i8
  ; SSE2-CODEGEN: shrl %cl

  %0 = lshr %shifttype4i8 %a, %b
  ret %shifttype4i8 %0
}

; lshr of <8 x i8> by the per-lane variable amount %b.
; Expected: cost-model cost of 32; llc output contains psrlw.
%shifttype8i8 = type <8 x i8>
define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
entry:
  ; SSE2: shift8i8
  ; SSE2: cost of 32 {{.*}} lshr
  ; SSE2-CODEGEN: shift8i8
  ; SSE2-CODEGEN: psrlw

  %0 = lshr %shifttype8i8 %a, %b
  ret %shifttype8i8 %0
}

; lshr of <16 x i8> by the per-lane variable amount %b.
; Expected: cost-model cost of 26; llc output contains psrlw.
%shifttype16i8 = type <16 x i8>
define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
entry:
  ; SSE2: shift16i8
  ; SSE2: cost of 26 {{.*}} lshr
  ; SSE2-CODEGEN: shift16i8
  ; SSE2-CODEGEN: psrlw

  %0 = lshr %shifttype16i8 %a, %b
  ret %shifttype16i8 %0
}

; lshr of <32 x i8> by the per-lane variable amount %b.
; Expected: cost-model cost of 52; llc output contains psrlw.
%shifttype32i8 = type <32 x i8>
define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
entry:
  ; SSE2: shift32i8
  ; SSE2: cost of 52 {{.*}} lshr
  ; SSE2-CODEGEN: shift32i8
  ; SSE2-CODEGEN: psrlw

  %0 = lshr %shifttype32i8 %a, %b
  ret %shifttype32i8 %0
}

; Test shift by a constant vector.

; lshr of <2 x i16> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrlq $3".
%shifttypec = type <2 x i16>
define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
entry:
  ; SSE2: shift2i16const
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift2i16const
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec %a, <i16 3, i16 3>
  ret %shifttypec %0
}

; lshr of <4 x i16> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrld $3".
%shifttypec4i16 = type <4 x i16>
define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
entry:
  ; SSE2: shift4i16const
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift4i16const
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec4i16 %a, <i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec4i16 %0
}

; lshr of <8 x i16> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrlw $3".
%shifttypec8i16 = type <8 x i16>
define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
entry:
  ; SSE2: shift8i16const
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift8i16const
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec8i16 %a, <i16 3, i16 3, i16 3, i16 3,
                                 i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec8i16 %0
}

; lshr of <16 x i16> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 2; llc output contains "psrlw $3".
%shifttypec16i16 = type <16 x i16>
define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                         %shifttypec16i16 %b) {
entry:
  ; SSE2: shift16i16const
  ; SSE2: cost of 2 {{.*}} lshr
  ; SSE2-CODEGEN: shift16i16const
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec16i16 %a, <i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec16i16 %0
}

; lshr of <32 x i16> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 4; llc output contains "psrlw $3".
%shifttypec32i16 = type <32 x i16>
define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
entry:
  ; SSE2: shift32i16const
  ; SSE2: cost of 4 {{.*}} lshr
  ; SSE2-CODEGEN: shift32i16const
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec32i16 %a, <i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3,
                                  i16 3, i16 3, i16 3, i16 3>
  ret %shifttypec32i16 %0
}

; lshr of <2 x i32> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrlq $3".
%shifttypec2i32 = type <2 x i32>
define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
entry:
  ; SSE2: shift2i32c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift2i32c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec2i32 %a, <i32 3, i32 3>
  ret %shifttypec2i32 %0
}

; lshr of <4 x i32> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrld $3".
%shifttypec4i32 = type <4 x i32>
define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
entry:
  ; SSE2: shift4i32c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift4i32c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec4i32 %a, <i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec4i32 %0
}

; lshr of <8 x i32> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 2; llc output contains "psrld $3".
%shifttypec8i32 = type <8 x i32>
define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
entry:
  ; SSE2: shift8i32c
  ; SSE2: cost of 2 {{.*}} lshr
  ; SSE2-CODEGEN: shift8i32c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec8i32 %a, <i32 3, i32 3, i32 3, i32 3,
                                 i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec8i32 %0
}

; lshr of <16 x i32> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 4; llc output contains "psrld $3".
%shifttypec16i32 = type <16 x i32>
define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
entry:
  ; SSE2: shift16i32c
  ; SSE2: cost of 4 {{.*}} lshr
  ; SSE2-CODEGEN: shift16i32c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec16i32 %a, <i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec16i32 %0
}

; lshr of <32 x i32> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 8; llc output contains "psrld $3".
%shifttypec32i32 = type <32 x i32>
define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
entry:
  ; SSE2: shift32i32c
  ; SSE2: cost of 8 {{.*}} lshr
  ; SSE2-CODEGEN: shift32i32c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec32i32 %a, <i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3,
                                  i32 3, i32 3, i32 3, i32 3>
  ret %shifttypec32i32 %0
}

; lshr of <2 x i64> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrlq $3".
%shifttypec2i64 = type <2 x i64>
define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
entry:
  ; SSE2: shift2i64c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift2i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec2i64 %a, <i64 3, i64 3>
  ret %shifttypec2i64 %0
}

; lshr of <4 x i64> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 2; llc output contains "psrlq $3".
%shifttypec4i64 = type <4 x i64>
define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
entry:
  ; SSE2: shift4i64c
  ; SSE2: cost of 2 {{.*}} lshr
  ; SSE2-CODEGEN: shift4i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec4i64 %a, <i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec4i64 %0
}

; lshr of <8 x i64> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 4; llc output contains "psrlq $3".
%shifttypec8i64 = type <8 x i64>
define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
entry:
  ; SSE2: shift8i64c
  ; SSE2: cost of 4 {{.*}} lshr
  ; SSE2-CODEGEN: shift8i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec8i64 %a, <i64 3, i64 3, i64 3, i64 3,
                                 i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec8i64 %0
}

; lshr of <16 x i64> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 8; llc output contains "psrlq $3".
%shifttypec16i64 = type <16 x i64>
define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
entry:
  ; SSE2: shift16i64c
  ; SSE2: cost of 8 {{.*}} lshr
  ; SSE2-CODEGEN: shift16i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec16i64 %a, <i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec16i64 %0
}

; lshr of <32 x i64> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 16; llc output contains "psrlq $3".
%shifttypec32i64 = type <32 x i64>
define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
entry:
  ; SSE2: shift32i64c
  ; SSE2: cost of 16 {{.*}} lshr
  ; SSE2-CODEGEN: shift32i64c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec32i64 %a, <i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3,
                                  i64 3, i64 3, i64 3, i64 3>
  ret %shifttypec32i64 %0
}

; lshr of <2 x i8> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrlq $3".
%shifttypec2i8 = type <2 x i8>
define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
entry:
  ; SSE2: shift2i8c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift2i8c
  ; SSE2-CODEGEN: psrlq $3

  %0 = lshr %shifttypec2i8 %a, <i8 3, i8 3>
  ret %shifttypec2i8 %0
}

; lshr of <4 x i8> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrld $3".
%shifttypec4i8 = type <4 x i8>
define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
entry:
  ; SSE2: shift4i8c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift4i8c
  ; SSE2-CODEGEN: psrld $3

  %0 = lshr %shifttypec4i8 %a, <i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec4i8 %0
}

; lshr of <8 x i8> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrlw $3".
%shifttypec8i8 = type <8 x i8>
define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
entry:
  ; SSE2: shift8i8c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift8i8c
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec8i8 %a, <i8 3, i8 3, i8 3, i8 3,
                                i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec8i8 %0
}

; lshr of <16 x i8> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 1; llc output contains "psrlw $3".
%shifttypec16i8 = type <16 x i8>
define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
entry:
  ; SSE2: shift16i8c
  ; SSE2: cost of 1 {{.*}} lshr
  ; SSE2-CODEGEN: shift16i8c
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec16i8 %a, <i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec16i8 %0
}

; lshr of <32 x i8> by a constant vector with every lane equal to 3
; (%b is unused).
; Expected: cost-model cost of 2; llc output contains "psrlw $3".
%shifttypec32i8 = type <32 x i8>
define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
entry:
  ; SSE2: shift32i8c
  ; SSE2: cost of 2 {{.*}} lshr
  ; SSE2-CODEGEN: shift32i8c
  ; SSE2-CODEGEN: psrlw $3

  %0 = lshr %shifttypec32i8 %a, <i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3,
                                 i8 3, i8 3, i8 3, i8 3>
  ret %shifttypec32i8 %0
}