Add intrinsics @llvm.arm.neon.vmulls and @llvm.arm.neon.vmullu.* back. Frontends

was lowering them to sext / uxt + mul instructions. Unfortunately the
optimization passes may hoist the extensions out of the loop and separate them.
When that happens, the long multiplication instructions can be broken into
several scalar instructions, causing significant performance issue.

Note the vmla and vmls intrinsics are not added back. Frontend will codegen them
as intrinsics vmull* + add / sub. Also note the isel optimizations for catching
mul + sext / zext are not changed either.

First part of rdar://8832507, rdar://9203134


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@128502 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evan Cheng 2011-03-29 23:06:19 +00:00
parent 75c7563f83
commit 92e3916c3b
5 changed files with 113 additions and 12 deletions

View File

@ -129,8 +129,12 @@ let Properties = [IntrNoMem, Commutative] in {
def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
// Vector Multiply and Accumulate/Subtract.
def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
@ -292,7 +296,7 @@ def int_arm_neon_vcvtfp2hf
def int_arm_neon_vcvthf2fp
: Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>;
// Narrowing Saturating Vector Moves.
// Narrowing and Lengthening Vector Moves.
def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;

View File

@ -2157,6 +2157,13 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
}
return Result;
}
case Intrinsic::arm_neon_vmulls:
case Intrinsic::arm_neon_vmullu: {
unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
? ARMISD::VMULLs : ARMISD::VMULLu;
return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
}
}
}

View File

@ -84,7 +84,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name.compare(14, 5, "vsubl", 5) == 0 ||
Name.compare(14, 5, "vaddw", 5) == 0 ||
Name.compare(14, 5, "vsubw", 5) == 0 ||
Name.compare(14, 5, "vmull", 5) == 0 ||
Name.compare(14, 5, "vmlal", 5) == 0 ||
Name.compare(14, 5, "vmlsl", 5) == 0 ||
Name.compare(14, 5, "vabdl", 5) == 0 ||

View File

@ -76,20 +76,13 @@
; CHECK: zext <4 x i16>
; CHECK-NEXT: sub <4 x i32>
; vmull should be auto-upgraded to multiply with sext/zext
; (but vmullp should remain an intrinsic)
; vmull* intrinsics will remain intrinsics
; CHECK: vmulls8
; CHECK-NOT: arm.neon.vmulls.v8i16
; CHECK: sext <8 x i8>
; CHECK-NEXT: sext <8 x i8>
; CHECK-NEXT: mul <8 x i16>
; CHECK: arm.neon.vmulls.v8i16
; CHECK: vmullu16
; CHECK-NOT: arm.neon.vmullu.v4i32
; CHECK: zext <4 x i16>
; CHECK-NEXT: zext <4 x i16>
; CHECK-NEXT: mul <4 x i32>
; CHECK: arm.neon.vmullu.v4i32
; CHECK: vmullp8
; CHECK: arm.neon.vmullp.v8i16

View File

@ -158,6 +158,15 @@ define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
ret <8 x i16> %tmp5
}
define <8 x i16> @vmulls8_int(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmulls8_int:
;CHECK: vmull.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmulls16:
;CHECK: vmull.s16
@ -169,6 +178,15 @@ define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
ret <4 x i32> %tmp5
}
define <4 x i32> @vmulls16_int(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmulls16_int:
;CHECK: vmull.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmulls32:
;CHECK: vmull.s32
@ -180,6 +198,15 @@ define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
ret <2 x i64> %tmp5
}
define <2 x i64> @vmulls32_int(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmulls32_int:
;CHECK: vmull.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmullu8:
;CHECK: vmull.u8
@ -191,6 +218,15 @@ define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
ret <8 x i16> %tmp5
}
define <8 x i16> @vmullu8_int(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmullu8_int:
;CHECK: vmull.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmullu16:
;CHECK: vmull.u16
@ -202,6 +238,15 @@ define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
ret <4 x i32> %tmp5
}
define <4 x i32> @vmullu16_int(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK: vmullu16_int:
;CHECK: vmull.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmullu32:
;CHECK: vmull.u32
@ -213,6 +258,15 @@ define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
ret <2 x i64> %tmp5
}
define <2 x i64> @vmullu32_int(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK: vmullu32_int:
;CHECK: vmull.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK: vmullp8:
;CHECK: vmull.p8
@ -233,6 +287,15 @@ entry:
ret <4 x i32> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16_int(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes16_int
; CHECK: vmull.s16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes32
@ -244,6 +307,15 @@ entry:
ret <2 x i64> %3
}
define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32_int(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_lanes32_int
; CHECK: vmull.s32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu16
@ -255,6 +327,15 @@ entry:
ret <4 x i32> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16_int(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu16_int
; CHECK: vmull.u16 q0, d0, d1[1]
%0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
%1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu32
@ -266,6 +347,23 @@ entry:
ret <2 x i64> %3
}
define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32_int(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
entry:
; CHECK: test_vmull_laneu32_int
; CHECK: vmull.u32 q0, d0, d1[1]
%0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
%1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %1
}
declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone