From dc2d418dd29ad9396aea06f2b72c9a7d29b30940 Mon Sep 17 00:00:00 2001
From: Jim Grosbach
Date: Mon, 8 Jul 2013 18:18:52 +0000
Subject: [PATCH] ARM: Improve codegen for generic vselect.

Fall back to by-element insert rather than building it up on the stack.

rdar://14351991

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185846 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp | 18 ++++++++++++
 test/CodeGen/ARM/vext.ll           | 23 +++++++++++----
 test/CodeGen/ARM/vselect_imax.ll   | 46 ++++++++++++++----------------
 3 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 991a703f818..8c4a3f13d13 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -4734,6 +4734,24 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
   return SDValue();
 }
 
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index f404eb8be5b..ef22a3ba534 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -136,20 +136,26 @@ define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 ; We should ignore a build_vector with more than two sources.
 ; Use illegal <32 x i16> type to produce such a shuffle after legalizing types.
-; Try to look for fallback to stack expansion.
+; Try to look for fallback to by-element inserts.
 define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
 ;CHECK: test_multisource:
-;CHECK: vst1.16
+;CHECK: vmov.16 [[REG:d[0-9]+]][0]
+;CHECK: vmov.16 [[REG]][1]
+;CHECK: vmov.16 [[REG]][2]
+;CHECK: vmov.16 [[REG]][3]
 	%tmp1 = load <32 x i16>* %B
 	%tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
 	ret <4 x i16> %tmp2
 }
 
 ; We don't handle shuffles using more than half of a 128-bit vector.
-; Again, test for fallback to stack expansion
+; Again, test for fallback to by-element inserts.
 define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
 ;CHECK: test_largespan:
-;CHECK: vst1.16
+;CHECK: vmov.16 [[REG:d[0-9]+]][0]
+;CHECK: vmov.16 [[REG]][1]
+;CHECK: vmov.16 [[REG]][2]
+;CHECK: vmov.16 [[REG]][3]
 	%tmp1 = load <8 x i16>* %B
 	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 	ret <4 x i16> %tmp2
@@ -160,7 +166,14 @@ define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
 ; lowering loop can result otherwise).
 define <8 x i16> @test_illegal(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK: test_illegal:
-;CHECK: vst1.16
+;CHECK: vmov.16 [[REG:d[0-9]+]][0]
+;CHECK: vmov.16 [[REG]][1]
+;CHECK: vmov.16 [[REG]][2]
+;CHECK: vmov.16 [[REG]][3]
+;CHECK: vmov.16 [[REG2:d[0-9]+]][0]
+;CHECK: vmov.16 [[REG2]][1]
+;CHECK: vmov.16 [[REG2]][2]
+;CHECK: vmov.16 [[REG2]][3]
 	%tmp1 = load <8 x i16>* %A
 	%tmp2 = load <8 x i16>* %B
 	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 7, i32 5, i32 13, i32 3, i32 2, i32 2, i32 9>
diff --git a/test/CodeGen/ARM/vselect_imax.ll b/test/CodeGen/ARM/vselect_imax.ll
index 7e79d6c68c2..9744f4dde88 100644
--- a/test/CodeGen/ARM/vselect_imax.ll
+++ b/test/CodeGen/ARM/vselect_imax.ll
@@ -1,3 +1,4 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -march=arm -mcpu=cortex-a8 | FileCheck %s --check-prefix=COST
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
 ; Make sure that ARM backend with NEON handles vselect.
 
@@ -20,11 +21,8 @@ define void @func_blend10(%T0_10* %loadaddr, %T0_10* %loadaddr2,
   %v0 = load %T0_10* %loadaddr
   %v1 = load %T0_10* %loadaddr2
   %c = icmp slt %T0_10 %v0, %v1
-; CHECK: vst1
-; CHECK: vst1
-; CHECK: vst1
-; CHECK: vst1
-; CHECK: vld
+; CHECK: vbsl
+; CHECK: vbsl
 ; COST: func_blend10
 ; COST: cost of 40 {{.*}} select
   %r = select %T1_10 %c, %T0_10 %v0, %T0_10 %v1
@@ -39,10 +37,8 @@ define void @func_blend14(%T0_14* %loadaddr, %T0_14* %loadaddr2,
   %v0 = load %T0_14* %loadaddr
   %v1 = load %T0_14* %loadaddr2
   %c = icmp slt %T0_14 %v0, %v1
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vbsl
+; CHECK: vbsl
 ; COST: func_blend14
 ; COST: cost of 41 {{.*}} select
   %r = select %T1_14 %c, %T0_14 %v0, %T0_14 %v1
@@ -54,13 +50,11 @@
 ; CHECK: func_blend15:
 define void @func_blend15(%T0_15* %loadaddr, %T0_15* %loadaddr2,
                           %T1_15* %blend, %T0_15* %storeaddr) {
+; CHECK: vbsl
+; CHECK: vbsl
   %v0 = load %T0_15* %loadaddr
   %v1 = load %T0_15* %loadaddr2
   %c = icmp slt %T0_15 %v0, %v1
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
 ; COST: func_blend15
 ; COST: cost of 82 {{.*}} select
   %r = select %T1_15 %c, %T0_15 %v0, %T0_15 %v1
@@ -72,13 +66,11 @@
 ; CHECK: func_blend18:
 define void @func_blend18(%T0_18* %loadaddr, %T0_18* %loadaddr2,
                           %T1_18* %blend, %T0_18* %storeaddr) {
+; CHECK: vbsl
+; CHECK: vbsl
   %v0 = load %T0_18* %loadaddr
   %v1 = load %T0_18* %loadaddr2
   %c = icmp slt %T0_18 %v0, %v1
-; CHECK: strh
-; CHECK: strh
-; CHECK: strh
-; CHECK: strh
 ; COST: func_blend18
 ; COST: cost of 19 {{.*}} select
   %r = select %T1_18 %c, %T0_18 %v0, %T0_18 %v1
@@ -90,13 +82,13 @@
 ; CHECK: func_blend19:
 define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
                           %T1_19* %blend, %T0_19* %storeaddr) {
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
   %v0 = load %T0_19* %loadaddr
   %v1 = load %T0_19* %loadaddr2
   %c = icmp slt %T0_19 %v0, %v1
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
 ; COST: func_blend19
 ; COST: cost of 50 {{.*}} select
   %r = select %T1_19 %c, %T0_19 %v0, %T0_19 %v1
@@ -108,13 +100,17 @@
 ; CHECK: func_blend20:
 define void @func_blend20(%T0_20* %loadaddr, %T0_20* %loadaddr2,
                           %T1_20* %blend, %T0_20* %storeaddr) {
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
   %v0 = load %T0_20* %loadaddr
   %v1 = load %T0_20* %loadaddr2
   %c = icmp slt %T0_20 %v0, %v1
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
 ; COST: func_blend20
 ; COST: cost of 100 {{.*}} select
   %r = select %T1_20 %c, %T0_20 %v0, %T0_20 %v1
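
Illustrative note, not part of the patch: a minimal reproducer in the same
2013-era (pre-opaque-pointer) IR dialect as the tests above. The file and
function names are made up for illustration; the body mirrors
test_multisource from vext.ll:

    ; repro.ll -- a shuffle whose legalized build_vector draws from more
    ; than two sources, which previously hit the stack-expansion fallback.
    define <4 x i16> @repro(<32 x i16>* %B) nounwind {
      %v = load <32 x i16>* %B
      %s = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
      ret <4 x i16> %s
    }

Before this patch, running llc -march=arm -mattr=+neon on code of this shape
materialized the result through a stack slot (element stores such as vst1.16
followed by a reload, per the old CHECK lines); with the patch it should
instead emit a chain of lane inserts (vmov.16 d<n>[lane]), as the updated
CHECK lines expect.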