From 04a359f768a1405461cb6cdb93bb5859c7814e54 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Wed, 7 May 2014 14:10:27 +0000 Subject: [PATCH] AArch64/ARM64: optimise vector selects & enable test When performing a scalar comparison that feeds into a vector select, it's actually better to do the comparison on the vector side: the scalar route would be "CMP -> CSEL -> DUP", the vector is "CM -> DUP" since the vector comparisons are all mask based. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208210 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM64/ARM64ISelLowering.cpp | 41 ++++ test/CodeGen/AArch64/neon-select_cc.ll | 2 +- test/CodeGen/ARM64/aarch64-neon-select_cc.ll | 206 +++++++++++++++++++ 3 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/ARM64/aarch64-neon-select_cc.ll diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp index 84bd52d976d..4c6d7648d57 100644 --- a/lib/Target/ARM64/ARM64ISelLowering.cpp +++ b/lib/Target/ARM64/ARM64ISelLowering.cpp @@ -366,6 +366,7 @@ ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM) setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; @@ -7121,6 +7122,44 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { IfTrue, IfFalse); } +/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with +/// the compare-mask instructions rather than going via NZCV, even if LHS and +/// RHS are really scalar. This replaces any scalar setcc in the above pattern +/// with a vector one followed by a DUP shuffle on the result. 
+static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT ResVT = N->getValueType(0); + + if (!N->getOperand(1).getValueType().isVector()) + return SDValue(); + + if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) + return SDValue(); + + SDLoc DL(N0); + + EVT SrcVT = N0.getOperand(0).getValueType(); + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, + ResVT.getSizeInBits() / SrcVT.getSizeInBits()); + EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); + + // First perform a vector comparison, where lane 0 is the one we're interested + // in. + SDValue LHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); + SDValue RHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); + SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); + + // Now duplicate the comparison mask we want across all other lanes. + SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), + Mask); + + return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); +} + SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -7149,6 +7188,8 @@ SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); + case ISD::SELECT: + return performSelectCombine(N, DAG); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::STORE: diff --git a/test/CodeGen/AArch64/neon-select_cc.ll b/test/CodeGen/AArch64/neon-select_cc.ll index f6b5d3ca57d..54103dbaaa2 100644 --- a/test/CodeGen/AArch64/neon-select_cc.ll +++ b/test/CodeGen/AArch64/neon-select_cc.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs 
-mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s - +; arm64 has separate copy of this test due to different codegen. define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) { ; CHECK-LABEL: test_select_cc_v8i8_i8: ; CHECK: and w0, w0, #0xff diff --git a/test/CodeGen/ARM64/aarch64-neon-select_cc.ll b/test/CodeGen/ARM64/aarch64-neon-select_cc.ll new file mode 100644 index 00000000000..255b90dfa64 --- /dev/null +++ b/test/CodeGen/ARM64/aarch64-neon-select_cc.ll @@ -0,0 +1,206 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) { +; CHECK-LABEL: test_select_cc_v8i8_i8: +; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0 +; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1 +; CHECK: cmeq [[MASK:v[0-9]+]].8b, v[[LHS]].8b, v[[RHS]].8b +; CHECK: dup [[DUPMASK:v[0-9]+]].8b, [[MASK]].b[0] +; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b + %cmp31 = icmp eq i8 %a, %b + %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d + ret <8x i8> %e +} + +define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8> %d ) { +; CHECK-LABEL: test_select_cc_v8i8_f32: +; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s +; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0] +; CHECK-NEXT: bsl [[DUPMASK]].8b, v2.8b, v3.8b + %cmp31 = fcmp oeq float %a, %b + %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d + ret <8x i8> %e +} + +define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8> %d ) { +; CHECK-LABEL: test_select_cc_v8i8_f64: +; CHECK: fcmeq d[[MASK:[0-9]+]], d0, d1 +; CHECK-NEXT: bsl v[[MASK]].8b, v2.8b, v3.8b + %cmp31 = fcmp oeq double %a, %b + %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d + ret <8x i8> %e +} + +define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ) { +; CHECK-LABEL: test_select_cc_v16i8_i8: +; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0 +; CHECK-DAG: fmov 
s[[RHS:[0-9]+]], w1 +; CHECK: cmeq [[MASK:v[0-9]+]].16b, v[[LHS]].16b, v[[RHS]].16b +; CHECK: dup [[DUPMASK:v[0-9]+]].16b, [[MASK]].b[0] +; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b + %cmp31 = icmp eq i8 %a, %b + %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d + ret <16x i8> %e +} + +define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x i8> %d ) { +; CHECK-LABEL: test_select_cc_v16i8_f32: +; CHECK: fcmeq [[MASK:v[0-9]+]].4s, v0.4s, v1.4s +; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0] +; CHECK-NEXT: bsl [[DUPMASK]].16b, v2.16b, v3.16b + %cmp31 = fcmp oeq float %a, %b + %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d + ret <16x i8> %e +} + +define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16x i8> %d ) { +; CHECK-LABEL: test_select_cc_v16i8_f64: +; CHECK: fcmeq [[MASK:v[0-9]+]].2d, v0.2d, v1.2d +; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0] +; CHECK-NEXT: bsl [[DUPMASK]].16b, v2.16b, v3.16b + %cmp31 = fcmp oeq double %a, %b + %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d + ret <16x i8> %e +} + +define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) { +; CHECK-LABEL: test_select_cc_v4i16: +; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0 +; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1 +; CHECK: cmeq [[MASK:v[0-9]+]].4h, v[[LHS]].4h, v[[RHS]].4h +; CHECK: dup [[DUPMASK:v[0-9]+]].4h, [[MASK]].h[0] +; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b + %cmp31 = icmp eq i16 %a, %b + %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d + ret <4x i16> %e +} + +define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) { +; CHECK-LABEL: test_select_cc_v8i16: +; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0 +; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1 +; CHECK: cmeq [[MASK:v[0-9]+]].8h, v[[LHS]].8h, v[[RHS]].8h +; CHECK: dup [[DUPMASK:v[0-9]+]].8h, [[MASK]].h[0] +; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b + %cmp31 = icmp eq i16 %a, %b + %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d + ret <8x i16> %e 
+} + +define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) { +; CHECK-LABEL: test_select_cc_v2i32: +; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0 +; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1 +; CHECK: cmeq [[MASK:v[0-9]+]].2s, v[[LHS]].2s, v[[RHS]].2s +; CHECK: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0] +; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b + %cmp31 = icmp eq i32 %a, %b + %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d + ret <2x i32> %e +} + +define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) { +; CHECK-LABEL: test_select_cc_v4i32: +; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0 +; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1 +; CHECK: cmeq [[MASK:v[0-9]+]].4s, v[[LHS]].4s, v[[RHS]].4s +; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0] +; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b + %cmp31 = icmp eq i32 %a, %b + %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d + ret <4x i32> %e +} + +define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) { +; CHECK-LABEL: test_select_cc_v1i64: +; CHECK-DAG: fmov d[[LHS:[0-9]+]], x0 +; CHECK-DAG: fmov d[[RHS:[0-9]+]], x1 +; CHECK: cmeq d[[MASK:[0-9]+]], d[[LHS]], d[[RHS]] +; CHECK: bsl v[[MASK]].8b, v0.8b, v1.8b + %cmp31 = icmp eq i64 %a, %b + %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d + ret <1x i64> %e +} + +define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) { +; CHECK-LABEL: test_select_cc_v2i64: +; CHECK-DAG: fmov d[[LHS:[0-9]+]], x0 +; CHECK-DAG: fmov d[[RHS:[0-9]+]], x1 +; CHECK: cmeq [[MASK:v[0-9]+]].2d, v[[LHS]].2d, v[[RHS]].2d +; CHECK: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0] +; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b + %cmp31 = icmp eq i64 %a, %b + %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d + ret <2x i64> %e +} + +define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) { +; CHECK-LABEL: test_select_cc_v1f32: +; CHECK: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s2, s3, eq + %cmp31 = fcmp oeq 
float %a, %b + %e = select i1 %cmp31, <1 x float> %c, <1 x float> %d + ret <1 x float> %e +} + +define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2 x float> %d ) { +; CHECK-LABEL: test_select_cc_v2f32: +; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s +; CHECK: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0] +; CHECK: bsl [[DUPMASK]].8b, v2.8b, v3.8b + %cmp31 = fcmp oeq float %a, %b + %e = select i1 %cmp31, <2 x float> %c, <2 x float> %d + ret <2 x float> %e +} + +define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x float> %d ) { +; CHECK-LABEL: test_select_cc_v4f32: +; CHECK: fcmeq [[MASK:v[0-9]+]].4s, v0.4s, v1.4s +; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0] +; CHECK: bsl [[DUPMASK]].16b, v2.16b, v3.16b + %cmp31 = fcmp oeq float %a, %b + %e = select i1 %cmp31, <4x float> %c, <4x float> %d + ret <4x float> %e +} + +define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x float> %d ) { +; CHECK-LABEL: test_select_cc_v4f32_icmp: +; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0 +; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1 +; CHECK: cmeq [[MASK:v[0-9]+]].4s, v[[LHS]].4s, v[[RHS]].4s +; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0] +; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b + %cmp31 = icmp eq i32 %a, %b + %e = select i1 %cmp31, <4x float> %c, <4x float> %d + ret <4x float> %e +} + +define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c, <1 x double> %d ) { +; CHECK-LABEL: test_select_cc_v1f64: +; CHECK: fcmeq d[[MASK:[0-9]+]], d0, d1 +; CHECK: bsl v[[MASK]].8b, v2.8b, v3.8b + %cmp31 = fcmp oeq double %a, %b + %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d + ret <1 x double> %e +} + +define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c, <1 x double> %d ) { +; CHECK-LABEL: test_select_cc_v1f64_icmp: +; CHECK-DAG: fmov [[LHS:d[0-9]+]], x0 +; CHECK-DAG: fmov [[RHS:d[0-9]+]], x1 +; CHECK: cmeq d[[MASK:[0-9]+]], [[LHS]], [[RHS]] +; CHECK: bsl 
v[[MASK]].8b, v0.8b, v1.8b + %cmp31 = icmp eq i64 %a, %b + %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d + ret <1 x double> %e +} + +define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c, <2 x double> %d ) { +; CHECK-LABEL: test_select_cc_v2f64: +; CHECK: fcmeq [[MASK:v[0-9]+]].2d, v0.2d, v1.2d +; CHECK: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0] +; CHECK: bsl [[DUPMASK]].16b, v2.16b, v3.16b + %cmp31 = fcmp oeq double %a, %b + %e = select i1 %cmp31, <2 x double> %c, <2 x double> %d + ret <2 x double> %e +}