2015-02-09 17:17:09 +00:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck -check-prefix=AVX2 %s
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx -mattr=-popcnt | FileCheck -check-prefix=AVX1-NOPOPCNT %s
|
|
|
|
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -mattr=-popcnt | FileCheck -check-prefix=AVX2-NOPOPCNT %s
|
[x86] Add vector @llvm.ctpop intrinsic custom lowering
Currently, when ctpop is supported for scalar types, the expansion of
@llvm.ctpop.vXiY uses vector element extractions, insertions and individual
calls to @llvm.ctpop.iY. When not, expansion with bit-math operations is used
for the scalar calls.
Local haswell measurements show that we can improve vector @llvm.ctpop.vXiY
expansion in some cases by using a vector parallel bit twiddling
approach, based on:
v = v - ((v >> 1) & 0x55555555);
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
v = (v + (v >> 4)) & 0xF0F0F0F
v = v + (v >> 8)
v = v + (v >> 16)
v = v & 0x0000003F
(from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel)
When scalar ctpop isn't supported, the approach above performs better for
v2i64, v4i32, v4i64 and v8i32 (see numbers below). And even when scalar ctpop
is supported, this approach performs ~2x better for v8i32.
Here, x86_64 implies -march=corei7-avx without ctpop and x86_64h includes ctpop
support with -march=core-avx2.
== [x86_64h - new]
v8i32: 0.661685
v4i32: 0.514678
v4i64: 0.652009
v2i64: 0.324289
== [x86_64h - old]
v8i32: 1.29578
v4i32: 0.528807
v4i64: 0.65981
v2i64: 0.330707
== [x86_64 - new]
v8i32: 1.003
v4i32: 0.656273
v4i64: 1.11711
v2i64: 0.754064
== [x86_64 - old]
v8i32: 2.34886
v4i32: 1.72053
v4i64: 1.41086
v2i64: 1.0244
More work for other vector types will come next.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224725 91177308-0d34-0410-b5e6-96231b3b80d8
2014-12-22 19:45:43 +00:00
|
|
|
|
|
|
|
; Vector version of:
|
|
|
|
; v = v - ((v >> 1) & 0x55555555)
|
|
|
|
; v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
|
|
|
|
; v = (v + (v >> 4)) & 0xF0F0F0F
|
|
|
|
; v = v + (v >> 8)
|
|
|
|
; v = v + (v >> 16)
|
|
|
|
; v = v + (v >> 32) ; i64 only
|
|
|
|
|
|
|
|
; test0 - population count of an <8 x i32> vector.
; Only the run that uses the plain AVX2 check prefix (-mattr=avx2) verifies
; this function; it expects the 256-bit (%ymm) shift / mask / add-sub sequence
; listed below, in exactly this order.
; AVX2-LABEL: @test0
define <8 x i32> @test0(<8 x i32> %x) {
entry:
; AVX2: vpsrld $1, %ymm
; AVX2-NEXT: vpbroadcastd
; AVX2-NEXT: vpand
; AVX2-NEXT: vpsubd
;
; AVX2-NEXT: vpbroadcastd
; AVX2-NEXT: vpand
; AVX2-NEXT: vpsrld $2
; AVX2-NEXT: vpand
; AVX2-NEXT: vpaddd
;
; AVX2-NEXT: vpsrld $4
; AVX2-NEXT: vpaddd
; AVX2-NEXT: vpbroadcastd
; AVX2-NEXT: vpand
;
; AVX2-NEXT: vpsrld $8
; AVX2-NEXT: vpaddd
;
; AVX2-NEXT: vpsrld $16
; AVX2-NEXT: vpaddd
;
; AVX2-NEXT: vpbroadcastd
; AVX2-NEXT: vpand
  %popcnt = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %x)
  ret <8 x i32> %popcnt
}
|
|
|
|
|
|
|
|
; test1 - population count of a <4 x i64> vector.
; Verified only by the run that enables avx2 and disables popcnt; it expects
; the 256-bit (%ymm) quadword sequence below, including the extra
; shift-by-32 / add step used for 64-bit elements.
; AVX2-NOPOPCNT-LABEL: @test1
define <4 x i64> @test1(<4 x i64> %x) {
entry:
; AVX2-NOPOPCNT: vpsrlq $1, %ymm
; AVX2-NOPOPCNT-NEXT: vpbroadcastq
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsubq
;
; AVX2-NOPOPCNT-NEXT: vpbroadcastq
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsrlq $2
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpaddq
;
; AVX2-NOPOPCNT-NEXT: vpsrlq $4
; AVX2-NOPOPCNT-NEXT: vpaddq
; AVX2-NOPOPCNT-NEXT: vpbroadcastq
; AVX2-NOPOPCNT-NEXT: vpand
;
; AVX2-NOPOPCNT-NEXT: vpsrlq $8
; AVX2-NOPOPCNT-NEXT: vpaddq
;
; AVX2-NOPOPCNT-NEXT: vpsrlq $16
; AVX2-NOPOPCNT-NEXT: vpaddq
;
; AVX2-NOPOPCNT-NEXT: vpsrlq $32
; AVX2-NOPOPCNT-NEXT: vpaddq
;
; AVX2-NOPOPCNT-NEXT: vpbroadcastq
; AVX2-NOPOPCNT-NEXT: vpand
  %popcnt = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %x)
  ret <4 x i64> %popcnt
}
|
|
|
|
|
|
|
|
; test2 - population count of a <4 x i32> vector.
; Verified by both no-popcnt runs (avx2 and avx). Each run has its own
; expected 128-bit (%xmm) sequence: the avx2 one contains vpbroadcastd steps,
; while the avx one has a single vmovdqa and no broadcasts.
; AVX2-NOPOPCNT-LABEL: @test2
; AVX1-NOPOPCNT-LABEL: @test2
define <4 x i32> @test2(<4 x i32> %x) {
entry:
; --- expectations for the avx2, no-popcnt run ---
; AVX2-NOPOPCNT: vpsrld $1, %xmm
; AVX2-NOPOPCNT-NEXT: vpbroadcastd
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsubd
;
; AVX2-NOPOPCNT-NEXT: vpbroadcastd
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsrld $2
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpaddd
;
; AVX2-NOPOPCNT-NEXT: vpsrld $4
; AVX2-NOPOPCNT-NEXT: vpaddd
; AVX2-NOPOPCNT-NEXT: vpbroadcastd
; AVX2-NOPOPCNT-NEXT: vpand
;
; AVX2-NOPOPCNT-NEXT: vpsrld $8
; AVX2-NOPOPCNT-NEXT: vpaddd
;
; AVX2-NOPOPCNT-NEXT: vpsrld $16
; AVX2-NOPOPCNT-NEXT: vpaddd
;
; AVX2-NOPOPCNT-NEXT: vpbroadcastd
; AVX2-NOPOPCNT-NEXT: vpand
;
; --- expectations for the avx, no-popcnt run ---
; AVX1-NOPOPCNT: vpsrld $1, %xmm
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpsubd
;
; AVX1-NOPOPCNT-NEXT: vmovdqa
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpsrld $2
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpaddd
;
; AVX1-NOPOPCNT-NEXT: vpsrld $4
; AVX1-NOPOPCNT-NEXT: vpaddd
; AVX1-NOPOPCNT-NEXT: vpand
;
; AVX1-NOPOPCNT-NEXT: vpsrld $8
; AVX1-NOPOPCNT-NEXT: vpaddd
;
; AVX1-NOPOPCNT-NEXT: vpsrld $16
; AVX1-NOPOPCNT-NEXT: vpaddd
;
; AVX1-NOPOPCNT-NEXT: vpand
  %popcnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
  ret <4 x i32> %popcnt
}
|
|
|
|
|
|
|
|
; test3 - population count of a <2 x i64> vector.
; Verified by both no-popcnt runs (avx2 and avx). The two runs expect the same
; 128-bit (%xmm) quadword sequence, including the shift-by-32 / add step used
; for 64-bit elements.
; AVX2-NOPOPCNT-LABEL: @test3
; AVX1-NOPOPCNT-LABEL: @test3
define <2 x i64> @test3(<2 x i64> %x) {
entry:
; --- expectations for the avx2, no-popcnt run ---
; AVX2-NOPOPCNT: vpsrlq $1, %xmm
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsubq
;
; AVX2-NOPOPCNT-NEXT: vmovdqa
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsrlq $2
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpaddq
;
; AVX2-NOPOPCNT-NEXT: vpsrlq $4
; AVX2-NOPOPCNT-NEXT: vpaddq
; AVX2-NOPOPCNT-NEXT: vpand
;
; AVX2-NOPOPCNT-NEXT: vpsrlq $8
; AVX2-NOPOPCNT-NEXT: vpaddq
;
; AVX2-NOPOPCNT-NEXT: vpsrlq $16
; AVX2-NOPOPCNT-NEXT: vpaddq
;
; AVX2-NOPOPCNT-NEXT: vpsrlq $32
; AVX2-NOPOPCNT-NEXT: vpaddq
;
; AVX2-NOPOPCNT-NEXT: vpand
;
; --- expectations for the avx, no-popcnt run ---
; AVX1-NOPOPCNT: vpsrlq $1, %xmm
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpsubq
;
; AVX1-NOPOPCNT-NEXT: vmovdqa
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpsrlq $2
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpaddq
;
; AVX1-NOPOPCNT-NEXT: vpsrlq $4
; AVX1-NOPOPCNT-NEXT: vpaddq
; AVX1-NOPOPCNT-NEXT: vpand
;
; AVX1-NOPOPCNT-NEXT: vpsrlq $8
; AVX1-NOPOPCNT-NEXT: vpaddq
;
; AVX1-NOPOPCNT-NEXT: vpsrlq $16
; AVX1-NOPOPCNT-NEXT: vpaddq
;
; AVX1-NOPOPCNT-NEXT: vpsrlq $32
; AVX1-NOPOPCNT-NEXT: vpaddq
;
; AVX1-NOPOPCNT-NEXT: vpand
  %popcnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
  ret <2 x i64> %popcnt
}
|
|
|
|
|
|
|
|
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
|
|
|
|
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
|
|
|
|
|
|
|
|
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
|
|
|
|
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
|
|
|
|
|