From 9c0c6b2e4a403454c7c5105e18d9ffe1eef2f498 Mon Sep 17 00:00:00 2001
From: Scott Michel <scottm@aero.org>
Date: Fri, 21 Nov 2008 02:56:16 +0000
Subject: [PATCH] CellSPU: (a) Fix bgs 3052, 3057 (b) Incorporate Duncan's
 suggestions re: i1 promotion (c) Indentation updates.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@59790 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/CellSPU/SPUISelDAGToDAG.cpp | 58 +++++++++++++++++-----
 lib/Target/CellSPU/SPUISelLowering.cpp | 66 +++++++++++++-------------
 lib/Target/CellSPU/SPUInstrInfo.cpp    |  4 +-
 lib/Target/CellSPU/SPUInstrInfo.td     | 46 ++++++++++++++++--
 test/CodeGen/CellSPU/loads.ll          | 20 ++++++++
 test/CodeGen/CellSPU/stores.ll         | 22 +++++++++
 6 files changed, 165 insertions(+), 51 deletions(-)
 create mode 100644 test/CodeGen/CellSPU/loads.ll
 create mode 100644 test/CodeGen/CellSPU/stores.ll

diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
index 144f5781cda..109cd5ee1ee 100644
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -430,8 +430,8 @@ bool
 SPUDAGToDAGISel::SelectDFormAddr(SDValue Op, SDValue N, SDValue &Base,
                                  SDValue &Index) {
   return DFormAddressPredicate(Op, N, Base, Index,
-                              SPUFrameInfo::minFrameOffset(),
-                              SPUFrameInfo::maxFrameOffset());
+                               SPUFrameInfo::minFrameOffset(),
+                               SPUFrameInfo::maxFrameOffset());
 }
 
 bool
@@ -544,7 +544,35 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base,
     Base = CurDAG->getTargetConstant(0, N.getValueType());
     Index = N;
     return true;
+  } else if (Opc == ISD::Register || Opc == ISD::CopyFromReg) {
+    unsigned OpOpc = Op.getOpcode();
+
+    if (OpOpc == ISD::STORE || OpOpc == ISD::LOAD) {
+      // Direct load/store without getelementptr
+      SDValue Addr, Offs;
+
+      // Get the register from CopyFromReg
+      if (Opc == ISD::CopyFromReg)
+        Addr = N.getOperand(1);
+      else
+        Addr = N;                       // Register
+
+      if (OpOpc == ISD::STORE)
+        Offs = Op.getOperand(3);
+      else
+        Offs = Op.getOperand(2);        // LOAD
+
+      if (Offs.getOpcode() == ISD::Constant || Offs.getOpcode() == ISD::UNDEF) {
+        if (Offs.getOpcode() == ISD::UNDEF)
+          Offs = CurDAG->getTargetConstant(0, Offs.getValueType());
+
+        Base = Offs;
+        Index = Addr;
+        return true;
+      }
+    }
   }
+
   return false;
 }
 
@@ -554,21 +582,27 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base,
   \arg Base The base pointer operand
   \arg Index The offset/index operand
 
-  If the address \a N can be expressed as a [r + s10imm] address, returns false.
-  Otherwise, creates two operands, Base and Index that will become the [r+r]
-  address.
+  If the address \a N can be expressed as an A-form or D-form address, returns
+  false.  Otherwise, creates two operands, Base and Index that will become the
+  (r)(r) X-form address.
 */
 bool
 SPUDAGToDAGISel::SelectXFormAddr(SDValue Op, SDValue N, SDValue &Base,
                                  SDValue &Index) {
-  if (SelectAFormAddr(Op, N, Base, Index)
-      || SelectDFormAddr(Op, N, Base, Index))
-    return false;
+  if (!SelectAFormAddr(Op, N, Base, Index)
+      && !SelectDFormAddr(Op, N, Base, Index)) {
+    // default form of a X-form address is r(r) in operands 0 and 1:
+    SDValue Op0 = N.getOperand(0);
+    SDValue Op1 = N.getOperand(1);
 
-  // All else fails, punt and use an X-form address:
-  Base = N.getOperand(0);
-  Index = N.getOperand(1);
-  return true;
+    if (Op0.getOpcode() == ISD::Register && Op1.getOpcode() == ISD::Register) {
+      Base = Op0;
+      Index = Op1;
+      return true;
+    }
+  }
+
+  return false;
 }
 
 //! Convert the operand from a target-independent to a target-specific node
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index 1cd00978eef..9f828b410b6 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -165,8 +165,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
     setOperationAction(ISD::STORE, VT, Custom);
   }
 
-  // Custom lower BRCOND for i1, i8 to "promote" the result to
-  // i32 and i16, respectively.
+  // Custom lower BRCOND for i8 to "promote" the result to i16
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 
   // Expand the jumptable branches
@@ -215,7 +214,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setOperationAction(ISD::SHL,  MVT::i8,     Custom);
   setOperationAction(ISD::SRL,  MVT::i8,     Custom);
   setOperationAction(ISD::SRA,  MVT::i8,     Custom);
-  // And SPU needs custom lowering for shift left/right for i64
+
+  // SPU needs custom lowering for shift left/right for i64
   setOperationAction(ISD::SHL,  MVT::i64,    Custom);
   setOperationAction(ISD::SRL,  MVT::i64,    Custom);
   setOperationAction(ISD::SRA,  MVT::i64,    Custom);
@@ -223,7 +223,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   // Custom lower i8, i32 and i64 multiplications
   setOperationAction(ISD::MUL,  MVT::i8,     Custom);
   setOperationAction(ISD::MUL,  MVT::i32,    Custom);
-  setOperationAction(ISD::MUL,  MVT::i64,    Expand);
+  setOperationAction(ISD::MUL,  MVT::i64,    Expand);   // libcall
+
+  // SMUL_LOHI, UMUL_LOHI
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
 
   // Need to custom handle (some) common i8, i64 math ops
   setOperationAction(ISD::ADD,  MVT::i64,    Custom);
@@ -247,13 +253,11 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
 
   // SPU has a version of select that implements (a&~c)|(b&c), just like
   // select ought to work:
-  setOperationAction(ISD::SELECT, MVT::i1,   Promote);
   setOperationAction(ISD::SELECT, MVT::i8,   Legal);
   setOperationAction(ISD::SELECT, MVT::i16,  Legal);
   setOperationAction(ISD::SELECT, MVT::i32,  Legal);
   setOperationAction(ISD::SELECT, MVT::i64,  Expand);
 
-  setOperationAction(ISD::SETCC, MVT::i1,    Promote);
   setOperationAction(ISD::SETCC, MVT::i8,    Legal);
   setOperationAction(ISD::SETCC, MVT::i16,   Legal);
   setOperationAction(ISD::SETCC, MVT::i32,   Legal);
@@ -299,7 +303,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
 
   // We want to legalize GlobalAddress and ConstantPool nodes into the
   // appropriate instructions to materialize the address.
-  for (unsigned sctype = (unsigned) MVT::i1; sctype < (unsigned) MVT::f128;
+  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
        ++sctype) {
     MVT VT = (MVT::SimpleValueType)sctype;
 
@@ -699,8 +703,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
     int chunk_offset, slot_offset;
     bool was16aligned;
 
-    // The vector type we really want to load from the 16-byte chunk, except
-    // in the case of MVT::i1, which has to be v16i8.
+    // The vector type we really want to load from the 16-byte chunk.
     MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())),
         stVecVT = MVT::getVectorVT(StVT, (128 / StVT.getSizeInBits()));
 
@@ -908,7 +911,7 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
   return SDValue();
 }
 
-//! Lower MVT::i1, MVT::i8 brcond to a promoted type (MVT::i32, MVT::i16)
+//! Lower MVT::i8 brcond to a promoted type (MVT::i32, MVT::i16)
 static SDValue
 LowerBRCOND(SDValue Op, SelectionDAG &DAG)
 {
@@ -916,8 +919,8 @@ LowerBRCOND(SDValue Op, SelectionDAG &DAG)
   MVT CondVT = Cond.getValueType();
   MVT CondNVT;
 
-  if (CondVT == MVT::i1 || CondVT == MVT::i8) {
-    CondNVT = (CondVT == MVT::i1 ? MVT::i32 : MVT::i16);
+  if (CondVT == MVT::i8) {
+    CondNVT = MVT::i16;
     return DAG.getNode(ISD::BRCOND, Op.getValueType(),
                       Op.getOperand(0),
                       DAG.getNode(ISD::ZERO_EXTEND, CondNVT, Op.getOperand(1)),
@@ -957,37 +960,37 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
 
       switch (ObjectVT.getSimpleVT()) {
       default: {
-	cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
-	     << ObjectVT.getMVTString()
-	     << "\n";
-	abort();
+        cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
+             << ObjectVT.getMVTString()
+             << "\n";
+        abort();
       }
       case MVT::i8:
-	ArgRegClass = &SPU::R8CRegClass;
-	break;
+        ArgRegClass = &SPU::R8CRegClass;
+        break;
       case MVT::i16:
-	ArgRegClass = &SPU::R16CRegClass;
-	break;
+        ArgRegClass = &SPU::R16CRegClass;
+        break;
       case MVT::i32:
-	ArgRegClass = &SPU::R32CRegClass;
-	break;
+        ArgRegClass = &SPU::R32CRegClass;
+        break;
       case MVT::i64:
-	ArgRegClass = &SPU::R64CRegClass;
-	break;
+        ArgRegClass = &SPU::R64CRegClass;
+        break;
       case MVT::f32:
-	ArgRegClass = &SPU::R32FPRegClass;
-	break;
+        ArgRegClass = &SPU::R32FPRegClass;
+        break;
       case MVT::f64:
-	ArgRegClass = &SPU::R64FPRegClass;
-	break;
+        ArgRegClass = &SPU::R64FPRegClass;
+        break;
       case MVT::v2f64:
       case MVT::v4f32:
       case MVT::v2i64:
       case MVT::v4i32:
       case MVT::v8i16:
       case MVT::v16i8:
-	ArgRegClass = &SPU::VECREGRegClass;
-	break;
+        ArgRegClass = &SPU::VECREGRegClass;
+        break;
       }
 
       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
@@ -2103,7 +2106,6 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
     // zero fill uppper part of preferred slot, don't care about the
     // other slots:
     unsigned int mask_val;
-
     if (i <= prefslot_end) {
       mask_val =
         ((i < prefslot_begin)
@@ -2884,7 +2886,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
   }
   }
   // Otherwise, return unchanged.
-#if 1
+#ifdef NDEBUG
   if (Result.getNode()) {
     DEBUG(cerr << "\nReplace.SPU: ");
     DEBUG(N->dump(&DAG));
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index d10f2b8e52d..510b05091f7 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -161,7 +161,7 @@ SPUInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
   case SPU::STQDr64:
   case SPU::STQDr32:
   case SPU::STQDr16:
-    // case SPU::STQDr8:
+  case SPU::STQDr8:
   case SPU::STQXv16i8:
   case SPU::STQXv8i16:
   case SPU::STQXv4i32:
@@ -171,7 +171,7 @@ SPUInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
   case SPU::STQXr64:
   case SPU::STQXr32:
   case SPU::STQXr16:
-    // case SPU::STQXr8:
+  case SPU::STQXr8:
     if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
         MI->getOperand(2).isFI()) {
       FrameIndex = MI->getOperand(2).getIndex();
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
index 17472a2a3ce..990865df04e 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -3494,26 +3494,62 @@ def FIf32 :
       "fi\t$rT, $rA, $rB", SPrecFP,
       [(set R32FP:$rT, (SPUinterpolate R32FP:$rA, R32FP:$rB))]>;
 
-// Floating Compare Equal
+//--------------------------------------------------------------------------
+// Basic single precision floating point comparisons:
+//
+// Note: There is no support on SPU for single precision NaN. Consequently,
+// ordered and unordered comparisons are the same.
+//--------------------------------------------------------------------------
+
 def FCEQf32 :
     RRForm<0b01000011110, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
       "fceq\t$rT, $rA, $rB", SPrecFP,
-      [(set R32C:$rT, (setoeq R32FP:$rA, R32FP:$rB))]>;
+      [(set R32C:$rT, (setueq R32FP:$rA, R32FP:$rB))]>;
+
+def : Pat<(setoeq R32FP:$rA, R32FP:$rB),
+          (FCEQf32 R32FP:$rA, R32FP:$rB)>;
 
 def FCMEQf32 :
     RRForm<0b01010011110, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
       "fcmeq\t$rT, $rA, $rB", SPrecFP,
-      [(set R32C:$rT, (setoeq (fabs R32FP:$rA), (fabs R32FP:$rB)))]>;
+      [(set R32C:$rT, (setueq (fabs R32FP:$rA), (fabs R32FP:$rB)))]>;
+
+def : Pat<(setoeq (fabs R32FP:$rA), (fabs R32FP:$rB)),
+          (FCMEQf32 R32FP:$rA, R32FP:$rB)>;
 
 def FCGTf32 :
     RRForm<0b01000011010, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
       "fcgt\t$rT, $rA, $rB", SPrecFP,
-      [(set R32C:$rT, (setogt R32FP:$rA, R32FP:$rB))]>;
+      [(set R32C:$rT, (setugt R32FP:$rA, R32FP:$rB))]>;
+
+def : Pat<(setugt R32FP:$rA, R32FP:$rB),
+          (FCGTf32 R32FP:$rA, R32FP:$rB)>;
 
 def FCMGTf32 :
     RRForm<0b01010011010, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
       "fcmgt\t$rT, $rA, $rB", SPrecFP,
-      [(set R32C:$rT, (setogt (fabs R32FP:$rA), (fabs R32FP:$rB)))]>;
+      [(set R32C:$rT, (setugt (fabs R32FP:$rA), (fabs R32FP:$rB)))]>;
+
+def : Pat<(setugt (fabs R32FP:$rA), (fabs R32FP:$rB)),
+          (FCMGTf32 R32FP:$rA, R32FP:$rB)>;
+
+//--------------------------------------------------------------------------
+// Single precision floating point comparisons and SETCC equivalents:
+//--------------------------------------------------------------------------
+
+def : SETCCNegCondReg<setune, R32FP, i32, XORIr32, FCEQf32>;
+def : SETCCNegCondReg<setone, R32FP, i32, XORIr32, FCEQf32>;
+
+def : SETCCBinOpReg<setuge, R32FP, ORr32, FCGTf32, FCEQf32>;
+def : SETCCBinOpReg<setoge, R32FP, ORr32, FCGTf32, FCEQf32>;
+
+def : SETCCBinOpReg<setult, R32FP, NORr32, FCGTf32, FCEQf32>;
+def : SETCCBinOpReg<setolt, R32FP, NORr32, FCGTf32, FCEQf32>;
+
+def : Pat<(setule R32FP:$rA, R32FP:$rB),
+          (XORIr32 (FCGTf32 R32FP:$rA, R32FP:$rB), 0xffffffff)>;
+def : Pat<(setole R32FP:$rA, R32FP:$rB),
+          (XORIr32 (FCGTf32 R32FP:$rA, R32FP:$rB), 0xffffffff)>;
 
 // FP Status and Control Register Write
 // Why isn't rT a don't care in the ISA?
diff --git a/test/CodeGen/CellSPU/loads.ll b/test/CodeGen/CellSPU/loads.ll
new file mode 100644
index 00000000000..3b9746c8080
--- /dev/null
+++ b/test/CodeGen/CellSPU/loads.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
+; RUN: grep {lqd.*0(\$3)}   %t1.s | count 1
+; RUN: grep {lqd.*16(\$3)}  %t1.s | count 1
+
+; ModuleID = 'loads.bc'
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
+define <4 x float> @load_v4f32_1(<4 x float>* %a) nounwind readonly {
+entry:
+	%tmp1 = load <4 x float>* %a
+	ret <4 x float> %tmp1
+}
+
+define <4 x float> @load_v4f32_2(<4 x float>* %a) nounwind readonly {
+entry:
+	%arrayidx = getelementptr <4 x float>* %a, i32 1		; <<4 x float>*> [#uses=1]
+	%tmp1 = load <4 x float>* %arrayidx		; <<4 x float>> [#uses=1]
+	ret <4 x float> %tmp1
+}
diff --git a/test/CodeGen/CellSPU/stores.ll b/test/CodeGen/CellSPU/stores.ll
new file mode 100644
index 00000000000..b9534abfc1a
--- /dev/null
+++ b/test/CodeGen/CellSPU/stores.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
+; RUN: grep {stqd.*0(\$3)}   %t1.s | count 1
+; RUN: grep {stqd.*16(\$3)}  %t1.s | count 1
+; RUN: grep 16256            %t1.s | count 1
+; RUN: grep 16384            %t1.s | count 1
+
+; ModuleID = 'stores.bc'
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
+define void @store_v4f32_1(<4 x float>* %a) nounwind {
+entry:
+	store <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x float>* %a
+	ret void
+}
+
+define void @store_v4f32_2(<4 x float>* %a) nounwind {
+entry:
+	%arrayidx = getelementptr <4 x float>* %a, i32 1
+	store <4 x float> < float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00 >, <4 x float>* %arrayidx
+	ret void
+}