
Merge a bunch of NEON tests into larger files so they run faster.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@83667 91177308-0d34-0410-b5e6-96231b3b80d8
Bob Wilson 2009-10-09 20:20:54 +00:00
parent 91e69c3715
commit 83815aeb29
87 changed files with 3526 additions and 3650 deletions
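
For context on the speedup: each of the small tests deleted below carried its own RUN line, so the test harness launched a separate llc + FileCheck pair for every file. After the merge, a single RUN line drives a whole family of functions. Below is a minimal sketch of the merged-file pattern; the merged_and/merged_orr names are placeholders for illustration, not functions from this commit.

; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; One RUN line now runs llc and FileCheck once for every function
; in the file, instead of once per standalone test file.
define <8 x i8> @merged_and(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: merged_and:
;CHECK: vand
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = and <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <8 x i8> @merged_orr(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: merged_orr:
;CHECK: vorr
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = or <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}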

@@ -135,3 +135,71 @@ declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) no
declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vabals8:
;CHECK: vabal.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vabals16:
;CHECK: vabal.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vabals32:
;CHECK: vabal.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vabalu8:
;CHECK: vabal.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vabalu16:
;CHECK: vabal.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vabalu32:
;CHECK: vabal.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -1,69 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vabals8:
;CHECK: vabal.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vabals16:
;CHECK: vabal.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vabals32:
;CHECK: vabal.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vabalu8:
;CHECK: vabal.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vabalu16:
;CHECK: vabal.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vabalu32:
;CHECK: vabal.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -145,3 +145,65 @@ declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind read
declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vabdls8:
;CHECK: vabdl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vabdls16:
;CHECK: vabdl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vabdls32:
;CHECK: vabdl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vabdlu8:
;CHECK: vabdl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vabdlu16:
;CHECK: vabdl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vabdlu32:
;CHECK: vabdl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@@ -1,63 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vabdls8:
;CHECK: vabdl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vabdls16:
;CHECK: vabdl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vabdls32:
;CHECK: vabdl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vabdlu8:
;CHECK: vabdl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vabdlu16:
;CHECK: vabdl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vabdlu32:
;CHECK: vabdl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@@ -74,3 +74,58 @@ declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float>) nounwind readnone
define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind {
;CHECK: vqabss8:
;CHECK: vqabs.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind {
;CHECK: vqabss16:
;CHECK: vqabs.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind {
;CHECK: vqabss32:
;CHECK: vqabs.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind {
;CHECK: vqabsQs8:
;CHECK: vqabs.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind {
;CHECK: vqabsQs16:
;CHECK: vqabs.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind {
;CHECK: vqabsQs32:
;CHECK: vqabs.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone

@@ -1,22 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vacgef32:
;CHECK: vacge.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vacgeQf32:
;CHECK: vacge.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone

@@ -1,22 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vacgtf32:
;CHECK: vacgt.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vacgtQf32:
;CHECK: vacgt.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone

@@ -89,3 +89,189 @@ define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp3 = add <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}
define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vaddhni16:
;CHECK: vaddhn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vaddhni32:
;CHECK: vaddhn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vaddhni64:
;CHECK: vaddhn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vraddhni16:
;CHECK: vraddhn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vraddhni32:
;CHECK: vraddhn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vraddhni64:
;CHECK: vraddhn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddls8:
;CHECK: vaddl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddls16:
;CHECK: vaddl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddls32:
;CHECK: vaddl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddlu8:
;CHECK: vaddl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddlu16:
;CHECK: vaddl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddlu32:
;CHECK: vaddl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddws8:
;CHECK: vaddw.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddws16:
;CHECK: vaddw.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddws32:
;CHECK: vaddw.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddwu8:
;CHECK: vaddw.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddwu16:
;CHECK: vaddw.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddwu32:
;CHECK: vaddw.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone

@@ -1,32 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vaddhni16:
;CHECK: vaddhn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vaddhni32:
;CHECK: vaddhn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vaddhni64:
;CHECK: vaddhn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@@ -1,63 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddls8:
;CHECK: vaddl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddls16:
;CHECK: vaddl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddls32:
;CHECK: vaddl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddlu8:
;CHECK: vaddl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddlu16:
;CHECK: vaddl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddlu32:
;CHECK: vaddl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@@ -1,63 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddws8:
;CHECK: vaddw.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddws16:
;CHECK: vaddw.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddws32:
;CHECK: vaddw.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vaddwu8:
;CHECK: vaddw.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vaddwu16:
;CHECK: vaddw.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vaddwu32:
;CHECK: vaddw.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone

@@ -1,73 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_andi8:
;CHECK: vand
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = and <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_andi16:
;CHECK: vand
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = and <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_andi32:
;CHECK: vand
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = and <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_andi64:
;CHECK: vand
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = and <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_andQi8:
;CHECK: vand
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = and <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_andQi16:
;CHECK: vand
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = and <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_andQi32:
;CHECK: vand
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = and <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_andQi64:
;CHECK: vand
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = and <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

@@ -1,81 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_bici8:
;CHECK: vbic
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_bici16:
;CHECK: vbic
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_bici32:
;CHECK: vbic
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = and <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_bici64:
;CHECK: vbic
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = and <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_bicQi8:
;CHECK: vbic
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_bicQi16:
;CHECK: vbic
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_bicQi32:
;CHECK: vbic
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = and <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_bicQi64:
;CHECK: vbic
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = and <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}

test/CodeGen/ARM/vbits.ll (new file, 507 lines, consolidating the bitwise-op tests deleted above)

@@ -0,0 +1,507 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_andi8:
;CHECK: vand
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = and <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_andi16:
;CHECK: vand
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = and <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_andi32:
;CHECK: vand
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = and <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_andi64:
;CHECK: vand
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = and <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_andQi8:
;CHECK: vand
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = and <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_andQi16:
;CHECK: vand
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = and <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_andQi32:
;CHECK: vand
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = and <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_andQi64:
;CHECK: vand
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = and <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_bici8:
;CHECK: vbic
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_bici16:
;CHECK: vbic
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_bici32:
;CHECK: vbic
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = and <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_bici64:
;CHECK: vbic
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = and <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_bicQi8:
;CHECK: vbic
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_bicQi16:
;CHECK: vbic
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_bicQi32:
;CHECK: vbic
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = and <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_bicQi64:
;CHECK: vbic
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = and <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_eori8:
;CHECK: veor
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_eori16:
;CHECK: veor
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_eori32:
;CHECK: veor
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_eori64:
;CHECK: veor
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_eorQi8:
;CHECK: veor
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_eorQi16:
;CHECK: veor
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_eorQi32:
;CHECK: veor
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_eorQi64:
;CHECK: veor
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind {
;CHECK: v_mvni8:
;CHECK: vmvn
%tmp1 = load <8 x i8>* %A
%tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind {
;CHECK: v_mvni16:
;CHECK: vmvn
%tmp1 = load <4 x i16>* %A
%tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind {
;CHECK: v_mvni32:
;CHECK: vmvn
%tmp1 = load <2 x i32>* %A
%tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
ret <2 x i32> %tmp2
}
define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind {
;CHECK: v_mvni64:
;CHECK: vmvn
%tmp1 = load <1 x i64>* %A
%tmp2 = xor <1 x i64> %tmp1, < i64 -1 >
ret <1 x i64> %tmp2
}
define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind {
;CHECK: v_mvnQi8:
;CHECK: vmvn
%tmp1 = load <16 x i8>* %A
%tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind {
;CHECK: v_mvnQi16:
;CHECK: vmvn
%tmp1 = load <8 x i16>* %A
%tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind {
;CHECK: v_mvnQi32:
;CHECK: vmvn
%tmp1 = load <4 x i32>* %A
%tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
ret <4 x i32> %tmp2
}
define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind {
;CHECK: v_mvnQi64:
;CHECK: vmvn
%tmp1 = load <2 x i64>* %A
%tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
ret <2 x i64> %tmp2
}
define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_orri8:
;CHECK: vorr
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = or <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_orri16:
;CHECK: vorr
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = or <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_orri32:
;CHECK: vorr
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = or <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_orri64:
;CHECK: vorr
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = or <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_orrQi8:
;CHECK: vorr
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = or <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_orrQi16:
;CHECK: vorr
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = or <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_orrQi32:
;CHECK: vorr
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = or <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_orrQi64:
;CHECK: vorr
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = or <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_orni8:
;CHECK: vorn
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = or <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_orni16:
;CHECK: vorn
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = or <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_orni32:
;CHECK: vorn
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = or <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_orni64:
;CHECK: vorn
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = or <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_ornQi8:
;CHECK: vorn
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = or <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_ornQi16:
;CHECK: vorn
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = or <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_ornQi32:
;CHECK: vorn
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = or <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_ornQi64:
;CHECK: vorn
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = or <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vtsti8:
;CHECK: vtst.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = and <8 x i8> %tmp1, %tmp2
%tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
%tmp5 = sext <8 x i1> %tmp4 to <8 x i8>
ret <8 x i8> %tmp5
}
define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vtsti16:
;CHECK: vtst.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = and <4 x i16> %tmp1, %tmp2
%tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer
%tmp5 = sext <4 x i1> %tmp4 to <4 x i16>
ret <4 x i16> %tmp5
}
define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vtsti32:
;CHECK: vtst.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = and <2 x i32> %tmp1, %tmp2
%tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer
%tmp5 = sext <2 x i1> %tmp4 to <2 x i32>
ret <2 x i32> %tmp5
}
define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vtstQi8:
;CHECK: vtst.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = and <16 x i8> %tmp1, %tmp2
%tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer
%tmp5 = sext <16 x i1> %tmp4 to <16 x i8>
ret <16 x i8> %tmp5
}
define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vtstQi16:
;CHECK: vtst.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = and <8 x i16> %tmp1, %tmp2
%tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer
%tmp5 = sext <8 x i1> %tmp4 to <8 x i16>
ret <8 x i16> %tmp5
}
define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vtstQi32:
;CHECK: vtst.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = and <4 x i32> %tmp1, %tmp2
%tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer
%tmp5 = sext <4 x i1> %tmp4 to <4 x i32>
ret <4 x i32> %tmp5
}

@@ -139,3 +139,24 @@ define <4 x i32> @vcgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vacgef32:
;CHECK: vacge.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vacgeQf32:
;CHECK: vacge.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone

@@ -139,3 +139,24 @@ define <4 x i32> @vcgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vacgtf32:
;CHECK: vacgt.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vacgtQf32:
;CHECK: vacgt.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone

@@ -1,57 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
;CHECK: vclss8:
;CHECK: vcls.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
;CHECK: vclss16:
;CHECK: vcls.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
;CHECK: vclss32:
;CHECK: vcls.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
;CHECK: vclsQs8:
;CHECK: vcls.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
;CHECK: vclsQs16:
;CHECK: vcls.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
;CHECK: vclsQs32:
;CHECK: vcls.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone

@@ -1,57 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
;CHECK: vclz8:
;CHECK: vclz.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
;CHECK: vclz16:
;CHECK: vclz.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
;CHECK: vclz32:
;CHECK: vclz.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
;CHECK: vclzQ8:
;CHECK: vclz.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
;CHECK: vclzQ16:
;CHECK: vclz.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
;CHECK: vclzQ32:
;CHECK: vclz.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone

@@ -18,3 +18,115 @@ define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone
define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
;CHECK: vclz8:
;CHECK: vclz.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
;CHECK: vclz16:
;CHECK: vclz.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
;CHECK: vclz32:
;CHECK: vclz.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
;CHECK: vclzQ8:
;CHECK: vclz.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
;CHECK: vclzQ16:
;CHECK: vclz.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
;CHECK: vclzQ32:
;CHECK: vclz.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone
define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
;CHECK: vclss8:
;CHECK: vcls.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
;CHECK: vclss16:
;CHECK: vcls.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
;CHECK: vclss32:
;CHECK: vcls.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
;CHECK: vclsQs8:
;CHECK: vcls.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
;CHECK: vclsQs16:
;CHECK: vcls.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
;CHECK: vclsQs32:
;CHECK: vcls.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone

@@ -63,3 +63,78 @@ define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
%tmp2 = uitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}
define <2 x i32> @vcvt_n_f32tos32(<2 x float>* %A) nounwind {
;CHECK: vcvt_n_f32tos32:
;CHECK: vcvt.s32.f32
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_n_f32tou32(<2 x float>* %A) nounwind {
;CHECK: vcvt_n_f32tou32:
;CHECK: vcvt.u32.f32
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_n_s32tof32(<2 x i32>* %A) nounwind {
;CHECK: vcvt_n_s32tof32:
;CHECK: vcvt.f32.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_n_u32tof32(<2 x i32>* %A) nounwind {
;CHECK: vcvt_n_u32tof32:
;CHECK: vcvt.f32.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
define <4 x i32> @vcvtQ_n_f32tos32(<4 x float>* %A) nounwind {
;CHECK: vcvtQ_n_f32tos32:
;CHECK: vcvt.s32.f32
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_n_f32tou32(<4 x float>* %A) nounwind {
;CHECK: vcvtQ_n_f32tou32:
;CHECK: vcvt.u32.f32
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_n_s32tof32(<4 x i32>* %A) nounwind {
;CHECK: vcvtQ_n_s32tof32:
;CHECK: vcvt.f32.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_n_u32tof32(<4 x i32>* %A) nounwind {
;CHECK: vcvtQ_n_u32tof32:
;CHECK: vcvt.f32.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone

@@ -1,76 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
;CHECK: vcvt_f32tos32:
;CHECK: vcvt.s32.f32
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
;CHECK: vcvt_f32tou32:
;CHECK: vcvt.u32.f32
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
;CHECK: vcvt_s32tof32:
;CHECK: vcvt.f32.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
;CHECK: vcvt_u32tof32:
;CHECK: vcvt.f32.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
;CHECK: vcvtQ_f32tos32:
;CHECK: vcvt.s32.f32
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
;CHECK: vcvtQ_f32tou32:
;CHECK: vcvt.u32.f32
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
;CHECK: vcvtQ_s32tof32:
;CHECK: vcvt.f32.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
;CHECK: vcvtQ_u32tof32:
;CHECK: vcvt.f32.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone

@@ -179,3 +179,91 @@ define <4 x float> @v_shuffledupQfloat2(float* %A) nounwind {
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
}
define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
;CHECK: vduplane8:
;CHECK: vdup.8
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
;CHECK: vduplane16:
;CHECK: vdup.16
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
;CHECK: vduplane32:
;CHECK: vdup.32
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
}
define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
;CHECK: vduplanefloat:
;CHECK: vdup.32
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
}
define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
;CHECK: vduplaneQ8:
;CHECK: vdup.8
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
;CHECK: vduplaneQ16:
;CHECK: vdup.16
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
;CHECK: vduplaneQ32:
;CHECK: vdup.32
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
}
define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
;CHECK: vduplaneQfloat:
;CHECK: vdup.32
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
}
define arm_apcscc <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
%0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %0
}
define arm_apcscc <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
%0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
ret <2 x i64> %0
}
define arm_apcscc <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
%0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
ret <2 x double> %0
}
define arm_apcscc <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
%0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
ret <2 x double> %0
}
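; The vduplane tests above all follow one pattern: a shufflevector whose mask
; repeats index 1 is a lane splat, selected as vdup.8/16/32 with a [1] lane
; operand. foo/bar/baz/qux cover 64-bit lane splats, for which there is no
; vdup lane encoding, so they only check that the pattern still lowers.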

@@ -1,89 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
;CHECK: vduplane8:
;CHECK: vdup.8
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
;CHECK: vduplane16:
;CHECK: vdup.16
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
;CHECK: vduplane32:
;CHECK: vdup.32
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
}
define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
;CHECK: vduplanefloat:
;CHECK: vdup.32
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
}
define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
;CHECK: vduplaneQ8:
;CHECK: vdup.8
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
;CHECK: vduplaneQ16:
;CHECK: vdup.16
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
;CHECK: vduplaneQ32:
;CHECK: vdup.32
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
}
define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
;CHECK: vduplaneQfloat:
;CHECK: vdup.32
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
}
define arm_apcscc <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
%0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %0
}
define arm_apcscc <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
entry:
%0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
ret <2 x i64> %0
}
define arm_apcscc <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
%0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
ret <2 x double> %0
}
define arm_apcscc <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
entry:
%0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
ret <2 x double> %0
}

@@ -1,73 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_eori8:
;CHECK: veor
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_eori16:
;CHECK: veor
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_eori32:
;CHECK: veor
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_eori64:
;CHECK: veor
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_eorQi8:
;CHECK: veor
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_eorQi16:
;CHECK: veor
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_eorQi32:
;CHECK: veor
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_eorQi64:
;CHECK: veor
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

@@ -1,4 +1,6 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; RUN: llc < %s -mattr=+neon | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
;CHECK: vget_lanes8:
@@ -91,3 +93,120 @@ define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
%tmp3 = extractelement <4 x i32> %tmp2, i32 1
ret i32 %tmp3
}
define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
entry:
; CHECK: vmov.u16 r0, d0[1]
%arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1]
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
%0 = load <4 x i16>* %arg0_uint16x4_t, align 8 ; <<4 x i16>> [#uses=1]
%1 = extractelement <4 x i16> %0, i32 1 ; <i16> [#uses=1]
store i16 %1, i16* %out_uint16_t, align 2
br label %return
return: ; preds = %entry
ret void
}
define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
entry:
; CHECK: vmov.u8 r0, d0[1]
%arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1]
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
%0 = load <8 x i8>* %arg0_uint8x8_t, align 8 ; <<8 x i8>> [#uses=1]
%1 = extractelement <8 x i8> %0, i32 1 ; <i8> [#uses=1]
store i8 %1, i8* %out_uint8_t, align 1
br label %return
return: ; preds = %entry
ret void
}
define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
entry:
; CHECK: vmov.u16 r0, d0[1]
%arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1]
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
%0 = load <8 x i16>* %arg0_uint16x8_t, align 16 ; <<8 x i16>> [#uses=1]
%1 = extractelement <8 x i16> %0, i32 1 ; <i16> [#uses=1]
store i16 %1, i16* %out_uint16_t, align 2
br label %return
return: ; preds = %entry
ret void
}
define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
entry:
; CHECK: vmov.u8 r0, d0[1]
%arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1]
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
%0 = load <16 x i8>* %arg0_uint8x16_t, align 16 ; <<16 x i8>> [#uses=1]
%1 = extractelement <16 x i8> %0, i32 1 ; <i8> [#uses=1]
store i8 %1, i8* %out_uint8_t, align 1
br label %return
return: ; preds = %entry
ret void
}
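; vmov.u8 and vmov.u16 zero-extend the selected lane into the core register;
; a sign-extending extract would be selected as vmov.s8/vmov.s16 instead.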
define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
;CHECK: vset_lane8:
;CHECK: vmov.8
%tmp1 = load <8 x i8>* %A
%tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1
ret <8 x i8> %tmp2
}
define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
;CHECK: vset_lane16:
;CHECK: vmov.16
%tmp1 = load <4 x i16>* %A
%tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1
ret <4 x i16> %tmp2
}
define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
;CHECK: vset_lane32:
;CHECK: vmov.32
%tmp1 = load <2 x i32>* %A
%tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1
ret <2 x i32> %tmp2
}
define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
;CHECK: vsetQ_lane8:
;CHECK: vmov.8
%tmp1 = load <16 x i8>* %A
%tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1
ret <16 x i8> %tmp2
}
define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
;CHECK: vsetQ_lane16:
;CHECK: vmov.16
%tmp1 = load <8 x i16>* %A
%tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1
ret <8 x i16> %tmp2
}
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vsetQ_lane32:
;CHECK: vmov.32
%tmp1 = load <4 x i32>* %A
%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
ret <4 x i32> %tmp2
}
define arm_aapcs_vfpcc <2 x float> @test_vset_lanef32(float %arg0_float32_t, <2 x float> %arg1_float32x2_t) nounwind {
;CHECK: test_vset_lanef32:
;CHECK: fcpys
;CHECK: fcpys
entry:
%0 = insertelement <2 x float> %arg1_float32x2_t, float %arg0_float32_t, i32 1 ; <<2 x float>> [#uses=1]
ret <2 x float> %0
}
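; Inserting a float lane stays in the VFP register file, so the CHECK lines
; expect fcpys single-precision register copies rather than a vmov.32.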

@@ -1,63 +0,0 @@
; RUN: llc < %s -mattr=+neon | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
entry:
; CHECK: vmov.u16 r0, d0[1]
%arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1]
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
%0 = load <4 x i16>* %arg0_uint16x4_t, align 8 ; <<4 x i16>> [#uses=1]
%1 = extractelement <4 x i16> %0, i32 1 ; <i16> [#uses=1]
store i16 %1, i16* %out_uint16_t, align 2
br label %return
return: ; preds = %entry
ret void
}
define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
entry:
; CHECK: vmov.u8 r0, d0[1]
%arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1]
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
%0 = load <8 x i8>* %arg0_uint8x8_t, align 8 ; <<8 x i8>> [#uses=1]
%1 = extractelement <8 x i8> %0, i32 1 ; <i8> [#uses=1]
store i8 %1, i8* %out_uint8_t, align 1
br label %return
return: ; preds = %entry
ret void
}
define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
entry:
; CHECK: vmov.u16 r0, d0[1]
%arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1]
%out_uint16_t = alloca i16 ; <i16*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
%0 = load <8 x i16>* %arg0_uint16x8_t, align 16 ; <<8 x i16>> [#uses=1]
%1 = extractelement <8 x i16> %0, i32 1 ; <i16> [#uses=1]
store i16 %1, i16* %out_uint16_t, align 2
br label %return
return: ; preds = %entry
ret void
}
define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
entry:
; CHECK: vmov.u8 r0, d0[1]
%arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1]
%out_uint8_t = alloca i8 ; <i8*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
%0 = load <16 x i8>* %arg0_uint8x16_t, align 16 ; <<16 x i8>> [#uses=1]
%1 = extractelement <16 x i8> %0, i32 1 ; <i8> [#uses=1]
store i8 %1, i8* %out_uint8_t, align 1
br label %return
return: ; preds = %entry
ret void
}

@@ -123,3 +123,127 @@ declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrhadds8:
;CHECK: vrhadd.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrhadds16:
;CHECK: vrhadd.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrhadds32:
;CHECK: vrhadd.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrhaddu8:
;CHECK: vrhadd.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrhaddu16:
;CHECK: vrhadd.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrhaddu32:
;CHECK: vrhadd.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrhaddQs8:
;CHECK: vrhadd.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrhaddQs16:
;CHECK: vrhadd.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrhaddQs32:
;CHECK: vrhadd.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrhaddQu8:
;CHECK: vrhadd.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrhaddQu16:
;CHECK: vrhadd.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrhaddQu32:
;CHECK: vrhadd.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
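; vrhadd is the rounding halving add: each lane computes (a + b + 1) >> 1
; without intermediate overflow, so vrhadd.s8 of 5 and 6 gives 6 where the
; truncating vhadd gives 5.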

@@ -1,147 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmins8:
;CHECK: vmin.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmins16:
;CHECK: vmin.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmins32:
;CHECK: vmin.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vminu8:
;CHECK: vmin.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vminu16:
;CHECK: vmin.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vminu32:
;CHECK: vmin.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vminf32:
;CHECK: vmin.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vminQs8:
;CHECK: vmin.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vminQs16:
;CHECK: vmin.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vminQs32:
;CHECK: vmin.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vminQu8:
;CHECK: vmin.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vminQu16:
;CHECK: vmin.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vminQu32:
;CHECK: vmin.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vminQf32:
;CHECK: vmin.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone

@@ -1,5 +1,151 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmins8:
;CHECK: vmin.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmins16:
;CHECK: vmin.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmins32:
;CHECK: vmin.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vminu8:
;CHECK: vmin.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vminu16:
;CHECK: vmin.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vminu32:
;CHECK: vmin.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vminf32:
;CHECK: vmin.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vminQs8:
;CHECK: vmin.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vminQs16:
;CHECK: vmin.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vminQs32:
;CHECK: vmin.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vminQu8:
;CHECK: vmin.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vminQu16:
;CHECK: vmin.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vminQu32:
;CHECK: vmin.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vminQf32:
;CHECK: vmin.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
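; The type suffix picks the comparison: vmin.s8 of -1 and 1 is -1, vmin.u8
; reads the same bytes as 255 and 1 and returns 1, and vmin.f32 compares as
; floating point.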
define <8 x i8> @vmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmaxs8:
;CHECK: vmax.s8

@@ -87,3 +87,107 @@ define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
%tmp5 = add <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}
define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vmlals8:
;CHECK: vmlal.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vmlals16:
;CHECK: vmlal.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vmlals32:
;CHECK: vmlal.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vmlalu8:
;CHECK: vmlal.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vmlalu16:
;CHECK: vmlal.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vmlalu32:
;CHECK: vmlal.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlal_lanes16
; CHECK: vmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlal_lanes32
; CHECK: vmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlal_laneu16
; CHECK: vmlal.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlal_laneu32
; CHECK: vmlal.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
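; vmlal widens both multiplicands before accumulating: each wide lane becomes
; acc + ext(a) * ext(b), so the product cannot wrap before the add. In the
; _lane tests the shufflevector splat of element 1 is folded into the d3[1]
; scalar operand checked above.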

@@ -1,69 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vmlals8:
;CHECK: vmlal.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vmlals16:
;CHECK: vmlal.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vmlals32:
;CHECK: vmlal.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vmlalu8:
;CHECK: vmlal.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vmlalu16:
;CHECK: vmlal.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vmlalu32:
;CHECK: vmlal.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -1,47 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlal_lanes16
; CHECK: vmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlal_lanes32
; CHECK: vmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlal_laneu16
; CHECK: vmlal.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlal_laneu32
; CHECK: vmlal.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -87,3 +87,107 @@ define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
%tmp5 = sub <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}
define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vmlsls8:
;CHECK: vmlsl.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vmlsls16:
;CHECK: vmlsl.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vmlsls32:
;CHECK: vmlsl.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vmlslu8:
;CHECK: vmlsl.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vmlslu16:
;CHECK: vmlsl.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vmlslu32:
;CHECK: vmlsl.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_lanes16
; CHECK: vmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_lanes32
; CHECK: vmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_laneu16
; CHECK: vmlsl.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_laneu32
; CHECK: vmlsl.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
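; vmlsl is the subtracting counterpart, acc - ext(a) * ext(b) per lane, with
; the same shufflevector-splat-to-lane-operand folding in the _lane tests.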

@@ -1,69 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vmlsls8:
;CHECK: vmlsl.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vmlsls16:
;CHECK: vmlsl.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vmlsls32:
;CHECK: vmlsl.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: vmlslu8:
;CHECK: vmlsl.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vmlslu16:
;CHECK: vmlsl.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vmlslu32:
;CHECK: vmlsl.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -1,47 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_lanes16
; CHECK: vmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_lanes32
; CHECK: vmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_laneu16
; CHECK: vmlsl.u16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmlsl_laneu32
; CHECK: vmlsl.u32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -133,3 +133,171 @@ define <2 x i64> @v_movQi64() nounwind {
;CHECK: vmov.i64
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}
define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind {
;CHECK: vmovls8:
;CHECK: vmovl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind {
;CHECK: vmovls16:
;CHECK: vmovl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind {
;CHECK: vmovls32:
;CHECK: vmovl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind {
;CHECK: vmovlu8:
;CHECK: vmovl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind {
;CHECK: vmovlu16:
;CHECK: vmovl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind {
;CHECK: vmovlu32:
;CHECK: vmovl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone
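; vmovl doubles each lane width: the .s forms sign-extend (an i8 -1 stays -1
; as an i16) and the .u forms zero-extend (the same 0xff byte becomes 255).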
define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind {
;CHECK: vmovni16:
;CHECK: vmovn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind {
;CHECK: vmovni32:
;CHECK: vmovn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind {
;CHECK: vmovni64:
;CHECK: vmovn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone
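; vmovn keeps only the low half of each lane, so an i16 value of 300 (0x012c)
; narrows to the i8 value 44 (0x2c); contrast the saturating vqmovn below.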
define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind {
;CHECK: vqmovns16:
;CHECK: vqmovn.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind {
;CHECK: vqmovns32:
;CHECK: vqmovn.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind {
;CHECK: vqmovns64:
;CHECK: vqmovn.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind {
;CHECK: vqmovnu16:
;CHECK: vqmovn.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind {
;CHECK: vqmovnu32:
;CHECK: vqmovn.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind {
;CHECK: vqmovnu64:
;CHECK: vqmovn.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind {
;CHECK: vqmovuns16:
;CHECK: vqmovun.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovuns32(<4 x i32>* %A) nounwind {
;CHECK: vqmovuns32:
;CHECK: vqmovun.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind {
;CHECK: vqmovuns64:
;CHECK: vqmovun.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone
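; The saturating narrows clamp instead of truncating: vqmovn.s16 maps 300 to
; 127 and -300 to -128, vqmovn.u16 clamps to [0,255], and vqmovun.s16 takes
; signed input but clamps to the unsigned range, so -5 becomes 0.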

@@ -1,57 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind {
;CHECK: vmovls8:
;CHECK: vmovl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind {
;CHECK: vmovls16:
;CHECK: vmovl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind {
;CHECK: vmovls32:
;CHECK: vmovl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind {
;CHECK: vmovlu8:
;CHECK: vmovl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind {
;CHECK: vmovlu16:
;CHECK: vmovl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind {
;CHECK: vmovlu32:
;CHECK: vmovl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone

@@ -1,29 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind {
;CHECK: vmovni16:
;CHECK: vmovn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind {
;CHECK: vmovni32:
;CHECK: vmovn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind {
;CHECK: vmovni64:
;CHECK: vmovn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone

@@ -92,3 +92,166 @@ define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanef32:
; CHECK: vmul.f32 d0, d0, d1[0]
%0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1]
%1 = fmul <2 x float> %0, %arg0_float32x2_t ; <<2 x float>> [#uses=1]
ret <2 x float> %1
}
define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanes16:
; CHECK: vmul.i16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = mul <4 x i16> %0, %arg0_int16x4_t ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanes32:
; CHECK: vmul.i32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = mul <2 x i32> %0, %arg0_int32x2_t ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanef32:
; CHECK: vmul.f32 q0, q0, d2[1]
%0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
%1 = fmul <4 x float> %0, %arg0_float32x4_t ; <<4 x float>> [#uses=1]
ret <4 x float> %1
}
define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanes16:
; CHECK: vmul.i16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = mul <8 x i16> %0, %arg0_int16x8_t ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanes32:
; CHECK: vmul.i32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = mul <4 x i32> %0, %arg0_int32x4_t ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmulls8:
;CHECK: vmull.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmulls16:
;CHECK: vmull.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmulls32:
;CHECK: vmull.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmullu8:
;CHECK: vmull.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmullu16:
;CHECK: vmull.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmullu32:
;CHECK: vmull.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmullp8:
;CHECK: vmull.p8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes16
; CHECK: vmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes32
; CHECK: vmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu16
; CHECK: vmull.u16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu32
; CHECK: vmull.u32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone

@ -1,57 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanef32:
; CHECK: vmul.f32 d0, d0, d1[0]
%0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1]
%1 = fmul <2 x float> %0, %arg0_float32x2_t ; <<2 x float>> [#uses=1]
ret <2 x float> %1
}
define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanes16:
; CHECK: vmul.i16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = mul <4 x i16> %0, %arg0_int16x4_t ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmul_lanes32:
; CHECK: vmul.i32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = mul <2 x i32> %0, %arg0_int32x2_t ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanef32:
; CHECK: vmul.f32 q0, q0, d2[1]
%0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
%1 = fmul <4 x float> %0, %arg0_float32x4_t ; <<4 x float>> [#uses=1]
ret <4 x float> %1
}
define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanes16:
; CHECK: vmul.i16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = mul <8 x i16> %0, %arg0_int16x8_t ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmulQ_lanes32:
; CHECK: vmul.i32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = mul <4 x i32> %0, %arg0_int32x4_t ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}

@ -1,74 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmulls8:
;CHECK: vmull.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmulls16:
;CHECK: vmull.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmulls32:
;CHECK: vmull.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmullu8:
;CHECK: vmull.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmullu16:
;CHECK: vmull.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmullu32:
;CHECK: vmull.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmullp8:
;CHECK: vmull.p8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone

@ -1,47 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes16
; CHECK: vmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes32
; CHECK: vmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu16
; CHECK: vmull.u16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu32
; CHECK: vmull.u32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@ -1,65 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind {
;CHECK: v_mvni8:
;CHECK: vmvn
%tmp1 = load <8 x i8>* %A
%tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind {
;CHECK: v_mvni16:
;CHECK: vmvn
%tmp1 = load <4 x i16>* %A
%tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind {
;CHECK: v_mvni32:
;CHECK: vmvn
%tmp1 = load <2 x i32>* %A
%tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
ret <2 x i32> %tmp2
}
define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind {
;CHECK: v_mvni64:
;CHECK: vmvn
%tmp1 = load <1 x i64>* %A
%tmp2 = xor <1 x i64> %tmp1, < i64 -1 >
ret <1 x i64> %tmp2
}
define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind {
;CHECK: v_mvnQi8:
;CHECK: vmvn
%tmp1 = load <16 x i8>* %A
%tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind {
;CHECK: v_mvnQi16:
;CHECK: vmvn
%tmp1 = load <8 x i16>* %A
%tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind {
;CHECK: v_mvnQi32:
;CHECK: vmvn
%tmp1 = load <4 x i32>* %A
%tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
ret <4 x i32> %tmp2
}
define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind {
;CHECK: v_mvnQi64:
;CHECK: vmvn
%tmp1 = load <2 x i64>* %A
%tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
ret <2 x i64> %tmp2
}

@ -63,3 +63,59 @@ define <4 x float> @vnegQf32(<4 x float>* %A) nounwind {
%tmp2 = sub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %tmp1
ret <4 x float> %tmp2
}
define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind {
;CHECK: vqnegs8:
;CHECK: vqneg.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind {
;CHECK: vqnegs16:
;CHECK: vqneg.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind {
;CHECK: vqnegs32:
;CHECK: vqneg.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind {
;CHECK: vqnegQs8:
;CHECK: vqneg.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind {
;CHECK: vqnegQs16:
;CHECK: vqneg.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind {
;CHECK: vqnegQs32:
;CHECK: vqneg.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone

@ -1,81 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_orni8:
;CHECK: vorn
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = or <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_orni16:
;CHECK: vorn
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = or <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_orni32:
;CHECK: vorn
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = or <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_orni64:
;CHECK: vorn
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = or <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_ornQi8:
;CHECK: vorn
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = or <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_ornQi16:
;CHECK: vorn
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = or <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_ornQi32:
;CHECK: vorn
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = or <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_ornQi64:
;CHECK: vorn
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = or <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}

@ -1,73 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: v_orri8:
;CHECK: vorr
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = or <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: v_orri16:
;CHECK: vorr
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = or <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: v_orri32:
;CHECK: vorr
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = or <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: v_orri64:
;CHECK: vorr
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = or <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: v_orrQi8:
;CHECK: vorr
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = or <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: v_orrQi16:
;CHECK: vorr
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = or <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: v_orrQi32:
;CHECK: vorr
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = or <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: v_orrQi64:
;CHECK: vorr
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = or <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

@ -41,3 +41,115 @@ declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) nounwind read
declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
;CHECK: vpaddls8:
;CHECK: vpaddl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
;CHECK: vpaddls16:
;CHECK: vpaddl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp2
}
define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
;CHECK: vpaddls32:
;CHECK: vpaddl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp2
}
define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
;CHECK: vpaddlu8:
;CHECK: vpaddl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
;CHECK: vpaddlu16:
;CHECK: vpaddl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp2
}
define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
;CHECK: vpaddlu32:
;CHECK: vpaddl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp2
}
define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
;CHECK: vpaddlQs8:
;CHECK: vpaddl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
;CHECK: vpaddlQs16:
;CHECK: vpaddl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
;CHECK: vpaddlQs32:
;CHECK: vpaddl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
;CHECK: vpaddlQu8:
;CHECK: vpaddl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
;CHECK: vpaddlQu16:
;CHECK: vpaddl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
;CHECK: vpaddlQu32:
;CHECK: vpaddl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

@ -1,113 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
;CHECK: vpaddls8:
;CHECK: vpaddl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
;CHECK: vpaddls16:
;CHECK: vpaddl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp2
}
define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
;CHECK: vpaddls32:
;CHECK: vpaddl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp2
}
define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
;CHECK: vpaddlu8:
;CHECK: vpaddl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
;CHECK: vpaddlu16:
;CHECK: vpaddl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp2
}
define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
;CHECK: vpaddlu32:
;CHECK: vpaddl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp2
}
define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
;CHECK: vpaddlQs8:
;CHECK: vpaddl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
;CHECK: vpaddlQs16:
;CHECK: vpaddl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
;CHECK: vpaddlQs32:
;CHECK: vpaddl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
;CHECK: vpaddlQu8:
;CHECK: vpaddl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
;CHECK: vpaddlQu16:
;CHECK: vpaddl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
;CHECK: vpaddlQu32:
;CHECK: vpaddl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

@ -1,74 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vpmaxs8:
;CHECK: vpmax.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vpmaxs16:
;CHECK: vpmax.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vpmaxs32:
;CHECK: vpmax.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vpmaxu8:
;CHECK: vpmax.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vpmaxu16:
;CHECK: vpmax.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vpmaxu32:
;CHECK: vpmax.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vpmaxf32:
;CHECK: vpmax.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

@ -72,3 +72,76 @@ declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind rea
declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vpmaxs8:
;CHECK: vpmax.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vpmaxs16:
;CHECK: vpmax.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vpmaxs32:
;CHECK: vpmax.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vpmaxu8:
;CHECK: vpmax.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vpmaxu16:
;CHECK: vpmax.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vpmaxu32:
;CHECK: vpmax.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vpmaxf32:
;CHECK: vpmax.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone

@ -1,47 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulhQ_lanes16
; CHECK: vqrdmulh.s16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulhQ_lanes32
; CHECK: vqrdmulh.s32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulh_lanes16
; CHECK: vqrdmulh.s16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulh_lanes32
; CHECK: vqrdmulh.s32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

@ -1,57 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind {
;CHECK: vqabss8:
;CHECK: vqabs.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind {
;CHECK: vqabss16:
;CHECK: vqabs.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind {
;CHECK: vqabss32:
;CHECK: vqabs.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind {
;CHECK: vqabsQs8:
;CHECK: vqabs.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind {
;CHECK: vqabsQs16:
;CHECK: vqabs.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind {
;CHECK: vqabsQs32:
;CHECK: vqabs.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone

@ -1,24 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vqdmlals16:
;CHECK: vqdmlal.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vqdmlals32:
;CHECK: vqdmlal.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@ -1,25 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmlal_lanes16
; CHECK: vqdmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmlal_lanes32
; CHECK: vqdmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@ -1,24 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vqdmlsls16:
;CHECK: vqdmlsl.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vqdmlsls32:
;CHECK: vqdmlsl.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@ -1,25 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmlsl_lanes16
; CHECK: vqdmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmlsl_lanes32
; CHECK: vqdmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vqdmul.ll Normal file (281 lines)

@ -0,0 +1,281 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqdmulhs16:
;CHECK: vqdmulh.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqdmulhs32:
;CHECK: vqdmulh.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqdmulhQs16:
;CHECK: vqdmulh.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqdmulhQs32:
;CHECK: vqdmulh.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmulhQ_lanes16
; CHECK: vqdmulh.s16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmulhQ_lanes32
; CHECK: vqdmulh.s32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmulh_lanes16
; CHECK: vqdmulh.s16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmulh_lanes32
; CHECK: vqdmulh.s32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqrdmulhs16:
;CHECK: vqrdmulh.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqrdmulhs32:
;CHECK: vqrdmulh.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqrdmulhQs16:
;CHECK: vqrdmulh.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqrdmulhQs32:
;CHECK: vqrdmulh.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulhQ_lanes16
; CHECK: vqrdmulh.s16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulhQ_lanes32
; CHECK: vqrdmulh.s32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulh_lanes16
; CHECK: vqrdmulh.s16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqRdmulh_lanes32
; CHECK: vqrdmulh.s32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqdmulls16:
;CHECK: vqdmull.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqdmulls32:
;CHECK: vqdmull.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmull_lanes16
; CHECK: vqdmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmull_lanes32
; CHECK: vqdmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vqdmlals16:
;CHECK: vqdmlal.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vqdmlals32:
;CHECK: vqdmlal.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmlal_lanes16
; CHECK: vqdmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmlal_lanes32
; CHECK: vqdmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK: vqdmlsls16:
;CHECK: vqdmlsl.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK: vqdmlsls32:
;CHECK: vqdmlsl.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmlsl_lanes16
; CHECK: vqdmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmlsl_lanes32
; CHECK: vqdmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@ -1,85 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqdmulhs16:
;CHECK: vqdmulh.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqdmulhs32:
;CHECK: vqdmulh.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqdmulhQs16:
;CHECK: vqdmulh.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqdmulhQs32:
;CHECK: vqdmulh.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
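; vqrdmulh is the rounding variant of vqdmulh: both return the high half of
; the doubled saturating product, but vqrdmulh rounds before discarding the
; low half.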
define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqrdmulhs16:
;CHECK: vqrdmulh.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqrdmulhs32:
;CHECK: vqrdmulh.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqrdmulhQs16:
;CHECK: vqrdmulh.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqrdmulhQs32:
;CHECK: vqrdmulh.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

@ -1,47 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmulhQ_lanes16
; CHECK: vqdmulh.s16 q0, q0, d2[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
%1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %1
}
declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmulhQ_lanes32
; CHECK: vqdmulh.s32 q0, q0, d2[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmulh_lanes16
; CHECK: vqdmulh.s16 d0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
ret <4 x i16> %1
}
declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmulh_lanes32
; CHECK: vqdmulh.s32 d0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
ret <2 x i32> %1
}
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

@ -1,22 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
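; vqdmull is a widening multiply: it returns the saturated doubled product
; at twice the source element width.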
define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqdmulls16:
;CHECK: vqdmull.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqdmulls32:
;CHECK: vqdmull.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@ -1,25 +0,0 @@
; RUN: llc -mattr=+neon < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"
define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vqdmull_lanes16
; CHECK: vqdmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vqdmull_lanes32
; CHECK: vqdmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@ -1,85 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
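; Three narrowing flavors: vqmovn.s (signed saturation), vqmovn.u (unsigned
; saturation), and vqmovun (signed input saturated to an unsigned result,
; expressed here with the vqmovnsu intrinsic).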
define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind {
;CHECK: vqmovns16:
;CHECK: vqmovn.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind {
;CHECK: vqmovns32:
;CHECK: vqmovn.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind {
;CHECK: vqmovns64:
;CHECK: vqmovn.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind {
;CHECK: vqmovnu16:
;CHECK: vqmovn.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind {
;CHECK: vqmovnu32:
;CHECK: vqmovn.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind {
;CHECK: vqmovnu64:
;CHECK: vqmovn.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind {
;CHECK: vqmovuns16:
;CHECK: vqmovun.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovuns32(<4 x i32>* %A) nounwind {
;CHECK: vqmovuns32:
;CHECK: vqmovun.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind {
;CHECK: vqmovuns64:
;CHECK: vqmovun.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone

@ -1,57 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
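; vqneg is a saturating negate: the most negative value saturates to the
; most positive value instead of wrapping.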
define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind {
;CHECK: vqnegs8:
;CHECK: vqneg.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind {
;CHECK: vqnegs16:
;CHECK: vqneg.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind {
;CHECK: vqnegs32:
;CHECK: vqneg.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind {
;CHECK: vqnegQs8:
;CHECK: vqneg.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind {
;CHECK: vqnegQs16:
;CHECK: vqneg.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind {
;CHECK: vqnegQs32:
;CHECK: vqneg.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone

@ -1,165 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vqrshls8:
;CHECK: vqrshl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqrshls16:
;CHECK: vqrshl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqrshls32:
;CHECK: vqrshl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vqrshls64:
;CHECK: vqrshl.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vqrshlu8:
;CHECK: vqrshl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqrshlu16:
;CHECK: vqrshl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqrshlu32:
;CHECK: vqrshl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vqrshlu64:
;CHECK: vqrshl.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vqrshlQs8:
;CHECK: vqrshl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqrshlQs16:
;CHECK: vqrshl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqrshlQs32:
;CHECK: vqrshl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vqrshlQs64:
;CHECK: vqrshl.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vqrshlQu8:
;CHECK: vqrshl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqrshlQu16:
;CHECK: vqrshl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqrshlQu32:
;CHECK: vqrshl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vqrshlQu64:
;CHECK: vqrshl.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@ -1,85 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind {
;CHECK: vqrshrns8:
;CHECK: vqrshrn.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind {
;CHECK: vqrshrns16:
;CHECK: vqrshrn.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind {
;CHECK: vqrshrns32:
;CHECK: vqrshrn.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind {
;CHECK: vqrshrnu8:
;CHECK: vqrshrn.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind {
;CHECK: vqrshrnu16:
;CHECK: vqrshrn.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind {
;CHECK: vqrshrnu32:
;CHECK: vqrshrn.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind {
;CHECK: vqrshruns8:
;CHECK: vqrshrun.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind {
;CHECK: vqrshruns16:
;CHECK: vqrshrun.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind {
;CHECK: vqrshruns32:
;CHECK: vqrshrun.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@ -365,3 +365,167 @@ declare <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8>, <16 x i8>) nounwind
declare <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
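; vqrshl shifts each element left by a per-element signed amount, rounding
; and saturating; negative amounts shift right.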
define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vqrshls8:
;CHECK: vqrshl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqrshls16:
;CHECK: vqrshl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqrshls32:
;CHECK: vqrshl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vqrshls64:
;CHECK: vqrshl.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vqrshlu8:
;CHECK: vqrshl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vqrshlu16:
;CHECK: vqrshl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vqrshlu32:
;CHECK: vqrshl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vqrshlu64:
;CHECK: vqrshl.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vqrshlQs8:
;CHECK: vqrshl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqrshlQs16:
;CHECK: vqrshl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqrshlQs32:
;CHECK: vqrshl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vqrshlQs64:
;CHECK: vqrshl.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vqrshlQu8:
;CHECK: vqrshl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vqrshlQu16:
;CHECK: vqrshl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vqrshlQu32:
;CHECK: vqrshl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vqrshlQu64:
;CHECK: vqrshl.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@ -83,3 +83,87 @@ declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind
declare <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
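; These intrinsics take the shift amount as a vector operand, with right
; shifts encoded as negative left shifts, so the all-(-8) constants below
; select the "#8" immediate in the expected vqrshrn.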
define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind {
;CHECK: vqrshrns8:
;CHECK: vqrshrn.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind {
;CHECK: vqrshrns16:
;CHECK: vqrshrn.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind {
;CHECK: vqrshrns32:
;CHECK: vqrshrn.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind {
;CHECK: vqrshrnu8:
;CHECK: vqrshrn.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind {
;CHECK: vqrshrnu16:
;CHECK: vqrshrn.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind {
;CHECK: vqrshrnu32:
;CHECK: vqrshrn.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind {
;CHECK: vqrshruns8:
;CHECK: vqrshrun.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind {
;CHECK: vqrshruns16:
;CHECK: vqrshrun.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind {
;CHECK: vqrshruns32:
;CHECK: vqrshrun.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@ -1,32 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
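; vraddhn adds two wide vectors and returns the rounded high half of each
; element sum, halving the element width.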
define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vraddhni16:
;CHECK: vraddhn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vraddhni32:
;CHECK: vraddhn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vraddhni64:
;CHECK: vraddhn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vrec.ll (new file, 119 lines)

@ -0,0 +1,119 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
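; vrecpe/vrsqrte produce initial reciprocal and reciprocal-square-root
; estimates; vrecps/vrsqrts compute the corresponding Newton-Raphson
; correction step used to refine those estimates.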
define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind {
;CHECK: vrecpei32:
;CHECK: vrecpe.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind {
;CHECK: vrecpeQi32:
;CHECK: vrecpe.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x float> @vrecpef32(<2 x float>* %A) nounwind {
;CHECK: vrecpef32:
;CHECK: vrecpe.f32
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind {
;CHECK: vrecpeQf32:
;CHECK: vrecpe.f32
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vrecpsf32:
;CHECK: vrecps.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vrecpsQf32:
;CHECK: vrecps.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind {
;CHECK: vrsqrtei32:
;CHECK: vrsqrte.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind {
;CHECK: vrsqrteQi32:
;CHECK: vrsqrte.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind {
;CHECK: vrsqrtef32:
;CHECK: vrsqrte.f32
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind {
;CHECK: vrsqrteQf32:
;CHECK: vrsqrte.f32
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vrsqrtsf32:
;CHECK: vrsqrts.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vrsqrtsQf32:
;CHECK: vrsqrts.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

@ -1,39 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind {
;CHECK: vrecpei32:
;CHECK: vrecpe.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind {
;CHECK: vrecpeQi32:
;CHECK: vrecpe.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x float> @vrecpef32(<2 x float>* %A) nounwind {
;CHECK: vrecpef32:
;CHECK: vrecpe.f32
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind {
;CHECK: vrecpeQf32:
;CHECK: vrecpe.f32
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone

@ -1,22 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vrecpsf32:
;CHECK: vrecps.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vrecpsQf32:
;CHECK: vrecps.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

@ -1,125 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
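; vrhadd is a rounding halving add: elementwise (a + b + 1) >> 1, so odd
; sums round up.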
define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrhadds8:
;CHECK: vrhadd.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrhadds16:
;CHECK: vrhadd.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrhadds32:
;CHECK: vrhadd.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrhaddu8:
;CHECK: vrhadd.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrhaddu16:
;CHECK: vrhadd.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrhaddu32:
;CHECK: vrhadd.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrhaddQs8:
;CHECK: vrhadd.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrhaddQs16:
;CHECK: vrhadd.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrhaddQs32:
;CHECK: vrhadd.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrhaddQu8:
;CHECK: vrhadd.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrhaddQu16:
;CHECK: vrhadd.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrhaddQu32:
;CHECK: vrhadd.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

@ -1,293 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrshls8:
;CHECK: vrshl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrshls16:
;CHECK: vrshl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrshls32:
;CHECK: vrshl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vrshls64:
;CHECK: vrshl.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrshlu8:
;CHECK: vrshl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrshlu16:
;CHECK: vrshl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrshlu32:
;CHECK: vrshl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vrshlu64:
;CHECK: vrshl.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrshlQs8:
;CHECK: vrshl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrshlQs16:
;CHECK: vrshl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrshlQs32:
;CHECK: vrshl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrshlQs64:
;CHECK: vrshl.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrshlQu8:
;CHECK: vrshl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrshlQu16:
;CHECK: vrshl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrshlQu32:
;CHECK: vrshl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrshlQu64:
;CHECK: vrshl.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind {
;CHECK: vrshrs8:
;CHECK: vrshr.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind {
;CHECK: vrshrs16:
;CHECK: vrshr.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind {
;CHECK: vrshrs32:
;CHECK: vrshr.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind {
;CHECK: vrshrs64:
;CHECK: vrshr.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind {
;CHECK: vrshru8:
;CHECK: vrshr.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind {
;CHECK: vrshru16:
;CHECK: vrshr.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind {
;CHECK: vrshru32:
;CHECK: vrshr.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind {
;CHECK: vrshru64:
;CHECK: vrshr.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind {
;CHECK: vrshrQs8:
;CHECK: vrshr.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind {
;CHECK: vrshrQs16:
;CHECK: vrshr.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind {
;CHECK: vrshrQs32:
;CHECK: vrshr.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind {
;CHECK: vrshrQs64:
;CHECK: vrshr.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind {
;CHECK: vrshrQu8:
;CHECK: vrshr.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind {
;CHECK: vrshrQu16:
;CHECK: vrshr.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind {
;CHECK: vrshrQu32:
;CHECK: vrshr.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind {
;CHECK: vrshrQu64:
;CHECK: vrshr.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@ -1,29 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
;CHECK: vrshrns8:
;CHECK: vrshrn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind {
;CHECK: vrshrns16:
;CHECK: vrshrn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind {
;CHECK: vrshrns32:
;CHECK: vrshrn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@ -1,39 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind {
;CHECK: vrsqrtei32:
;CHECK: vrsqrte.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind {
;CHECK: vrsqrteQi32:
;CHECK: vrsqrte.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind {
;CHECK: vrsqrtef32:
;CHECK: vrsqrte.f32
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind {
;CHECK: vrsqrteQf32:
;CHECK: vrsqrte.f32
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone

@ -1,22 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK: vrsqrtsf32:
;CHECK: vrsqrts.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK: vrsqrtsQf32:
;CHECK: vrsqrts.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

@ -1,32 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
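; vrsubhn is the subtracting counterpart of vraddhn: the rounded high half
; of each element difference.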
define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrsubhni16:
;CHECK: vrsubhn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrsubhni32:
;CHECK: vrsubhn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrsubhni64:
;CHECK: vrsubhn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@ -1,58 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
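; insertelement into an integer vector should select a vmov to the lane;
; the float case is expected to use VFP register copies (fcpys) instead.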
define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
;CHECK: vset_lane8:
;CHECK: vmov.8
%tmp1 = load <8 x i8>* %A
%tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1
ret <8 x i8> %tmp2
}
define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
;CHECK: vset_lane16:
;CHECK: vmov.16
%tmp1 = load <4 x i16>* %A
%tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1
ret <4 x i16> %tmp2
}
define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
;CHECK: vset_lane32:
;CHECK: vmov.32
%tmp1 = load <2 x i32>* %A
%tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1
ret <2 x i32> %tmp2
}
define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
;CHECK: vsetQ_lane8:
;CHECK: vmov.8
%tmp1 = load <16 x i8>* %A
%tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1
ret <16 x i8> %tmp2
}
define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
;CHECK: vsetQ_lane16:
;CHECK: vmov.16
%tmp1 = load <8 x i16>* %A
%tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1
ret <8 x i16> %tmp2
}
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vsetQ_lane32:
;CHECK: vmov.32
%tmp1 = load <4 x i32>* %A
%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
ret <4 x i32> %tmp2
}
define arm_aapcs_vfpcc <2 x float> @test_vset_lanef32(float %arg0_float32_t, <2 x float> %arg1_float32x2_t) nounwind {
;CHECK: test_vset_lanef32:
;CHECK: fcpys
;CHECK: fcpys
entry:
%0 = insertelement <2 x float> %arg1_float32x2_t, float %arg0_float32_t, i32 1 ; <<2 x float>> [#uses=1]
ret <2 x float> %0
}

@ -280,6 +280,13 @@ define <2 x i64> @vlshrQi64(<2 x i64>* %A) nounwind {
ret <2 x i64> %tmp2
}
; Example that requires splitting and expanding a vector shift.
define <2 x i64> @update(<2 x i64> %val) nounwind readnone {
entry:
%shr = lshr <2 x i64> %val, < i64 2, i64 2 > ; <<2 x i64>> [#uses=1]
ret <2 x i64> %shr
}
define <8 x i8> @vashrs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vashrs8:
;CHECK: vneg.s8

@@ -1,8 +0,0 @@
; RUN: llc < %s -march=arm -mattr=-neon
; Example that requires splitting and expanding a vector shift.
define <2 x i64> @update(<2 x i64> %val) nounwind readnone {
entry:
%shr = lshr <2 x i64> %val, < i64 2, i64 2 > ; <<2 x i64>> [#uses=1]
ret <2 x i64> %shr
}

@@ -360,3 +360,295 @@ declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind re
declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
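; Rounding shifts. The vrshl tests below take per-element shift amounts from
; a second vector operand; negative elements shift right with rounding.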
define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrshls8:
;CHECK: vrshl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrshls16:
;CHECK: vrshl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrshls32:
;CHECK: vrshl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vrshls64:
;CHECK: vrshl.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vrshlu8:
;CHECK: vrshl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vrshlu16:
;CHECK: vrshl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vrshlu32:
;CHECK: vrshl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK: vrshlu64:
;CHECK: vrshl.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrshlQs8:
;CHECK: vrshl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrshlQs16:
;CHECK: vrshl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrshlQs32:
;CHECK: vrshl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrshlQs64:
;CHECK: vrshl.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vrshlQu8:
;CHECK: vrshl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrshlQu16:
;CHECK: vrshl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrshlQu32:
;CHECK: vrshl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrshlQu64:
;CHECK: vrshl.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
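; Rounding shift right by an immediate is represented as a rounding shift left
; by a negative amount, so each constant shift-amount vector below is a splat
; of minus the element width (e.g. a splat of -8 for i8 selects vrshr #8).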
define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind {
;CHECK: vrshrs8:
;CHECK: vrshr.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind {
;CHECK: vrshrs16:
;CHECK: vrshr.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind {
;CHECK: vrshrs32:
;CHECK: vrshr.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind {
;CHECK: vrshrs64:
;CHECK: vrshr.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind {
;CHECK: vrshru8:
;CHECK: vrshr.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind {
;CHECK: vrshru16:
;CHECK: vrshr.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind {
;CHECK: vrshru32:
;CHECK: vrshr.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind {
;CHECK: vrshru64:
;CHECK: vrshr.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind {
;CHECK: vrshrQs8:
;CHECK: vrshr.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind {
;CHECK: vrshrQs16:
;CHECK: vrshr.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind {
;CHECK: vrshrQs32:
;CHECK: vrshr.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind {
;CHECK: vrshrQs64:
;CHECK: vrshr.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind {
;CHECK: vrshrQu8:
;CHECK: vrshr.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind {
;CHECK: vrshrQu16:
;CHECK: vrshr.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind {
;CHECK: vrshrQu32:
;CHECK: vrshr.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind {
;CHECK: vrshrQu64:
;CHECK: vrshr.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -27,3 +27,31 @@ define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
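; The rounding narrowing shifts use the same negative-constant encoding for
; the shift amount, here a splat of minus half the source element width.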
define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
;CHECK: vrshrns8:
;CHECK: vrshrn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind {
;CHECK: vrshrns16:
;CHECK: vrshrn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind {
;CHECK: vrshrns32:
;CHECK: vrshrn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@@ -89,3 +89,189 @@ define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp3 = sub <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}
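; Narrowing (vsubhn/vrsubhn), long (vsubl), and wide (vsubw) subtract
; variants follow.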
define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vsubhni16:
;CHECK: vsubhn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vsubhni32:
;CHECK: vsubhn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vsubhni64:
;CHECK: vsubhn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
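; vrsubhn is the rounding form: the difference is rounded before the high
; half of each element is taken.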
define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vrsubhni16:
;CHECK: vrsubhn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vrsubhni32:
;CHECK: vrsubhn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vrsubhni64:
;CHECK: vrsubhn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubls8:
;CHECK: vsubl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubls16:
;CHECK: vsubl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubls32:
;CHECK: vsubl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsublu8:
;CHECK: vsubl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsublu16:
;CHECK: vsubl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsublu32:
;CHECK: vsubl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubws8:
;CHECK: vsubw.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubws16:
;CHECK: vsubw.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubws32:
;CHECK: vsubw.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubwu8:
;CHECK: vsubw.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubwu16:
;CHECK: vsubw.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubwu32:
;CHECK: vsubw.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone

@@ -1,32 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vsubhni16:
;CHECK: vsubhn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vsubhni32:
;CHECK: vsubhn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK: vsubhni64:
;CHECK: vsubhn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@@ -1,63 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubls8:
;CHECK: vsubl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubls16:
;CHECK: vsubl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubls32:
;CHECK: vsubl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsublu8:
;CHECK: vsubl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsublu16:
;CHECK: vsubl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsublu32:
;CHECK: vsubl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@@ -1,63 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubws8:
;CHECK: vsubw.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubws16:
;CHECK: vsubw.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubws32:
;CHECK: vsubw.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
;CHECK: vsubwu8:
;CHECK: vsubw.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
;CHECK: vsubwu16:
;CHECK: vsubw.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
;CHECK: vsubwu32:
;CHECK: vsubw.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone

@@ -1,67 +0,0 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vtsti8:
;CHECK: vtst.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = and <8 x i8> %tmp1, %tmp2
%tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
%tmp5 = sext <8 x i1> %tmp4 to <8 x i8>
ret <8 x i8> %tmp5
}
define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vtsti16:
;CHECK: vtst.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = and <4 x i16> %tmp1, %tmp2
%tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer
%tmp5 = sext <4 x i1> %tmp4 to <4 x i16>
ret <4 x i16> %tmp5
}
define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vtsti32:
;CHECK: vtst.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = and <2 x i32> %tmp1, %tmp2
%tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer
%tmp5 = sext <2 x i1> %tmp4 to <2 x i32>
ret <2 x i32> %tmp5
}
define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK: vtstQi8:
;CHECK: vtst.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = and <16 x i8> %tmp1, %tmp2
%tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer
%tmp5 = sext <16 x i1> %tmp4 to <16 x i8>
ret <16 x i8> %tmp5
}
define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK: vtstQi16:
;CHECK: vtst.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = and <8 x i16> %tmp1, %tmp2
%tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer
%tmp5 = sext <8 x i1> %tmp4 to <8 x i16>
ret <8 x i16> %tmp5
}
define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK: vtstQi32:
;CHECK: vtst.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = and <4 x i32> %tmp1, %tmp2
%tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer
%tmp5 = sext <4 x i1> %tmp4 to <4 x i32>
ret <4 x i32> %tmp5
}