From 9f6c4c141ffa9c8b13e90dce2f2285c4479ff403 Mon Sep 17 00:00:00 2001
From: Bob Wilson <bob.wilson@apple.com>
Date: Thu, 18 Feb 2010 06:05:53 +0000
Subject: [PATCH] Use NEON vmin/vmax instructions for floating-point selects.
 Radar 7461718.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@96572 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp | 93 +++++++++++++++++++++++++++---
 lib/Target/ARM/ARMISelLowering.h   |  6 +-
 lib/Target/ARM/ARMInstrNEON.td     | 19 ++++++
 test/CodeGen/ARM/neon_minmax.ll    | 65 +++++++++++++++++++++
 4 files changed, 174 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/ARM/neon_minmax.ll
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index adf16442e8e..ecc8289ff77 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -294,6 +294,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
     setTargetDAGCombine(ISD::ANY_EXTEND);
+    setTargetDAGCombine(ISD::SELECT_CC);
   }
 
   computeRegisterProperties();
@@ -544,6 +545,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VZIP:          return "ARMISD::VZIP";
   case ARMISD::VUZP:          return "ARMISD::VUZP";
   case ARMISD::VTRN:          return "ARMISD::VTRN";
+  case ARMISD::FMAX:          return "ARMISD::FMAX";
+  case ARMISD::FMIN:          return "ARMISD::FMIN";
   }
 }
 
@@ -3856,23 +3859,97 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
+/// to match f32 max/min patterns to use NEON vmax/vmin instructions.
+static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
+                                       const ARMSubtarget *ST) {
+  // If the target supports NEON, try to use vmax/vmin instructions for f32
+  // selects like "x < y ? x : y".  Unless the FiniteOnlyFPMath option is set,
+  // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
+  // a NaN; only do the transformation when it matches that behavior.
+
+  // For now only do this when using NEON for FP operations; if using VFP, it
+  // is not obvious that the benefit outweighs the cost of switching to the
+  // NEON pipeline.
+  if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
+      N->getValueType(0) != MVT::f32)
+    return SDValue();
+
+  SDValue CondLHS = N->getOperand(0);
+  SDValue CondRHS = N->getOperand(1);
+  SDValue LHS = N->getOperand(2);
+  SDValue RHS = N->getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+  unsigned Opcode = 0;
+  bool IsReversed;
+  if (LHS == CondLHS && RHS == CondRHS) {
+    IsReversed = false; // x CC y ? x : y
+  } else if (LHS == CondRHS && RHS == CondLHS) {
+    IsReversed = true ; // x CC y ? y : x
+  } else {
+    return SDValue();
+  }
+
+  switch (CC) {
+  default: break;
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETLT:
+  case ISD::SETLE:
+    // This can be vmin if we can prove that the LHS is not a NaN.
+    // (If either operand is NaN, the comparison will be false and the result
+    // will be the RHS, which matches vmin if RHS is the NaN.)
+    if (DAG.isKnownNeverNaN(LHS))
+      Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
+    break;
+
+  case ISD::SETULT:
+  case ISD::SETULE:
+    // Likewise, for ULT/ULE we need to know that RHS is not a NaN.
+    if (DAG.isKnownNeverNaN(RHS))
+      Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
+    break;
+
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETGT:
+  case ISD::SETGE:
+    // This can be vmax if we can prove that the LHS is not a NaN.
+    // (If either operand is NaN, the comparison will be false and the result
+    // will be the RHS, which matches vmax if RHS is the NaN.)
+    if (DAG.isKnownNeverNaN(LHS))
+      Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
+    break;
+
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    // Likewise, for UGT/UGE we need to know that RHS is not a NaN.
+    if (DAG.isKnownNeverNaN(RHS))
+      Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
+    break;
+  }
+
+  if (!Opcode)
+    return SDValue();
+  return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS);
+}
+
 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   default: break;
-  case ISD::ADD:      return PerformADDCombine(N, DCI);
-  case ISD::SUB:      return PerformSUBCombine(N, DCI);
+  case ISD::ADD:        return PerformADDCombine(N, DCI);
+  case ISD::SUB:        return PerformSUBCombine(N, DCI);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
-  case ISD::INTRINSIC_WO_CHAIN:
-    return PerformIntrinsicCombine(N, DCI.DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
   case ISD::SHL:
   case ISD::SRA:
-  case ISD::SRL:
-    return PerformShiftCombine(N, DCI.DAG, Subtarget);
+  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
-  case ISD::ANY_EXTEND:
-    return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
   }
   return SDValue();
 }
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 3c5df45dc55..f8f8adc70af 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -131,7 +131,11 @@ namespace llvm {
       VREV16,       // reverse elements within 16-bit halfwords
       VZIP,         // zip (interleave)
       VUZP,         // unzip (deinterleave)
-      VTRN          // transpose
+      VTRN,         // transpose
+
+      // Floating-point max and min:
+      FMAX,
+      FMIN
     };
   }
 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index f981572c76c..99e81970657 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -89,6 +89,11 @@ def NEONzip       : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
 def NEONuzp       : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
 def NEONtrn       : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
 
+def SDTARMFMAX    : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>]>;
+def NEONfmax      : SDNode<"ARMISD::FMAX", SDTARMFMAX>;
+def NEONfmin      : SDNode<"ARMISD::FMIN", SDTARMFMAX>;
+
 //===----------------------------------------------------------------------===//
 // NEON operand definitions
 //===----------------------------------------------------------------------===//
@@ -3023,6 +3028,20 @@ def  VNEGfd_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
                       "vneg", "f32", "$dst, $src", "", []>;
 def : N2VSPat<fneg, f32, v2f32, VNEGfd_sfp>;
 
+// Vector Maximum used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMAXfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst),
+                     (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND,
+                     "vmax", "f32", "$dst, $src1, $src2", "", []>;
+def : N3VSPat<NEONfmax, VMAXfd_sfp>;
+
+// Vector Minimum used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMINfd_sfp : N3V<0, 0, 0b00, 0b1111, 0, 0, (outs DPR_VFP2:$dst),
+                     (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND,
+                     "vmin", "f32", "$dst, $src1, $src2", "", []>;
+def : N3VSPat<NEONfmin, VMINfd_sfp>;
+
 // Vector Convert between single-precision FP and integer
 let neverHasSideEffects = 1 in
 def  VCVTf2sd_sfp : N2VS<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
diff --git a/test/CodeGen/ARM/neon_minmax.ll b/test/CodeGen/ARM/neon_minmax.ll
new file mode 100644
index 00000000000..64349d650e9
--- /dev/null
+++ b/test/CodeGen/ARM/neon_minmax.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+
+define float @fmin_ole(float %x) nounwind {
+;CHECK: fmin_ole:
+;CHECK: vmin.f32
+  %cond = fcmp ole float 1.0, %x
+  %min1 = select i1 %cond, float 1.0, float %x
+  ret float %min1
+}
+
+define float @fmin_ult(float %x) nounwind {
+;CHECK: fmin_ult:
+;CHECK: vmin.f32
+  %cond = fcmp ult float %x, 1.0
+  %min1 = select i1 %cond, float %x, float 1.0
+  ret float %min1
+}
+
+define float @fmax_ogt(float %x) nounwind {
+;CHECK: fmax_ogt:
+;CHECK: vmax.f32
+  %cond = fcmp ogt float 1.0, %x
+  %max1 = select i1 %cond, float 1.0, float %x
+  ret float %max1
+}
+
+define float @fmax_uge(float %x) nounwind {
+;CHECK: fmax_uge:
+;CHECK: vmax.f32
+  %cond = fcmp uge float %x, 1.0
+  %max1 = select i1 %cond, float %x, float 1.0
+  ret float %max1
+}
+
+define float @fmax_olt_reverse(float %x) nounwind {
+;CHECK: fmax_olt_reverse:
+;CHECK: vmax.f32
+  %cond = fcmp olt float %x, 1.0
+  %max1 = select i1 %cond, float 1.0, float %x
+  ret float %max1
+}
+
+define float @fmax_ule_reverse(float %x) nounwind {
+;CHECK: fmax_ule_reverse:
+;CHECK: vmax.f32
+  %cond = fcmp ult float 1.0, %x
+  %max1 = select i1 %cond, float %x, float 1.0
+  ret float %max1
+}
+
+define float @fmin_oge_reverse(float %x) nounwind {
+;CHECK: fmin_oge_reverse:
+;CHECK: vmin.f32
+  %cond = fcmp oge float %x, 1.0
+  %min1 = select i1 %cond, float 1.0, float %x
+  ret float %min1
+}
+
+define float @fmin_ugt_reverse(float %x) nounwind {
+;CHECK: fmin_ugt_reverse:
+;CHECK: vmin.f32
+  %cond = fcmp ugt float 1.0, %x
+  %min1 = select i1 %cond, float %x, float 1.0
+  ret float %min1
+}