From 30ee7df71c4b08da5d7e3f772f29f7c9ca57d8fa Mon Sep 17 00:00:00 2001 From: Scott Michel Date: Thu, 4 Dec 2008 03:02:42 +0000 Subject: [PATCH] CellSPU: - First patch from Nehal Desai, a new contributor at Aerospace. Nehal's patch fixes sign/zero/any-extending loads for integers and floating point. Example code, compiled w/o debugging or optimization where he first noticed the bug: int main(void) { float a = 99.0; printf("%d\n", a); return 0; } Verified that this code actually works on a Cell SPU. Changes by Scott Michel: - Fix bug in the value type list constructed by SPUISD::LDRESULT to include both the load result's result and chain, not just the chain alone. - Simplify LowerLOAD and remove extraneous and unnecessary chains. - Remove unused SPUISD pseudo instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@60526 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/CellSPU/SPUISelDAGToDAG.cpp | 2 +- lib/Target/CellSPU/SPUISelLowering.cpp | 120 ++++++++++++------------- lib/Target/CellSPU/SPUISelLowering.h | 6 -- lib/Target/CellSPU/SPUInstrInfo.td | 49 ---------- lib/Target/CellSPU/SPUNodes.td | 12 --- 5 files changed, 56 insertions(+), 133 deletions(-) diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 8d65cf99518..14f3edd4d6a 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -676,7 +676,7 @@ SPUDAGToDAGISel::Select(SDValue Op) { Result = CurDAG->getTargetNode(Opc, VT, MVT::Other, Arg, Zero, Chain); } else { - Result = CurDAG->getTargetNode(Opc, MVT::Other, Arg, Arg, Chain); + Result = CurDAG->getTargetNode(Opc, VT, MVT::Other, Arg, Arg, Chain); } Chain = SDValue(Result, 1); diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 9913a8bc9eb..f8056484b7c 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -436,12 +436,6 @@ 
SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; node_names[(unsigned) SPUISD::PROMOTE_SCALAR] = "SPUISD::PROMOTE_SCALAR"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::VEC2PREFSLOT_CHAINED] - = "SPUISD::VEC2PREFSLOT_CHAINED"; - node_names[(unsigned) SPUISD::EXTRACT_I1_ZEXT] = "SPUISD::EXTRACT_I1_ZEXT"; - node_names[(unsigned) SPUISD::EXTRACT_I1_SEXT] = "SPUISD::EXTRACT_I1_SEXT"; - node_names[(unsigned) SPUISD::EXTRACT_I8_ZEXT] = "SPUISD::EXTRACT_I8_ZEXT"; - node_names[(unsigned) SPUISD::EXTRACT_I8_SEXT] = "SPUISD::EXTRACT_I8_SEXT"; node_names[(unsigned) SPUISD::MPY] = "SPUISD::MPY"; node_names[(unsigned) SPUISD::MPYU] = "SPUISD::MPYU"; node_names[(unsigned) SPUISD::MPYH] = "SPUISD::MPYH"; @@ -458,8 +452,6 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::ROTQUAD_RZ_BITS] = "SPUISD::ROTQUAD_RZ_BITS"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; - node_names[(unsigned) SPUISD::ROTBYTES_LEFT_CHAINED] = - "SPUISD::ROTBYTES_LEFT_CHAINED"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] = "SPUISD::ROTBYTES_LEFT_BITS"; node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK"; @@ -597,13 +589,24 @@ AlignedLoad(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST, /*! All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements within a 16-byte block, we have to rotate to extract the requested element. - */ + + For extending loads, we also want to ensure that the following sequence is + emitted, e.g. 
for MVT::f32 extending load to MVT::f64: + +\verbatim +%1 v16i8,ch = load +%2 v16i8,ch = rotate %1 +%3 v4f32, ch = bitconvert %2 +%4 f32 = vec2prefslot %3 +%5 f64 = fp_extend %4 +\endverbatim +*/ static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { LoadSDNode *LN = cast(Op); SDValue the_chain = LN->getChain(); - MVT VT = LN->getMemoryVT(); - MVT OpVT = Op.getNode()->getValueType(0); + MVT InVT = LN->getMemoryVT(); + MVT OutVT = Op.getValueType(); ISD::LoadExtType ExtType = LN->getExtensionType(); unsigned alignment = LN->getAlignment(); SDValue Ops[8]; @@ -613,7 +616,8 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { int offset, rotamt; bool was16aligned; SDValue result = - AlignedLoad(Op, DAG, ST, LN,alignment, offset, rotamt, VT, was16aligned); + AlignedLoad(Op, DAG, ST, LN,alignment, offset, rotamt, InVT, + was16aligned); if (result.getNode() == 0) return result; @@ -625,57 +629,40 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { if (rotamt != 0 || !was16aligned) { SDVTList vecvts = DAG.getVTList(MVT::v16i8, MVT::Other); - Ops[0] = the_chain; - Ops[1] = result; + Ops[0] = result; if (was16aligned) { - Ops[2] = DAG.getConstant(rotamt, MVT::i16); + Ops[1] = DAG.getConstant(rotamt, MVT::i16); } else { MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); LoadSDNode *LN1 = cast(result); - Ops[2] = DAG.getNode(ISD::ADD, PtrVT, LN1->getBasePtr(), + Ops[1] = DAG.getNode(ISD::ADD, PtrVT, LN1->getBasePtr(), DAG.getConstant(rotamt, PtrVT)); } - result = DAG.getNode(SPUISD::ROTBYTES_LEFT_CHAINED, vecvts, Ops, 3); - the_chain = result.getValue(1); + result = DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v16i8, Ops, 2); } - if (VT == OpVT || ExtType == ISD::EXTLOAD) { - SDVTList scalarvts; - MVT vecVT = MVT::v16i8; + // Convert the loaded v16i8 vector to the appropriate vector type + // specified by the operand: + MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits())); + result = 
DAG.getNode(SPUISD::VEC2PREFSLOT, InVT, + DAG.getNode(ISD::BIT_CONVERT, vecVT, result)); - // Convert the loaded v16i8 vector to the appropriate vector type - // specified by the operand: - if (OpVT == VT) { - if (VT != MVT::i1) - vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())); - } else - vecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits())); + // Handle extending loads by extending the scalar result: + if (ExtType == ISD::SEXTLOAD) { + result = DAG.getNode(ISD::SIGN_EXTEND, OutVT, result); + } else if (ExtType == ISD::ZEXTLOAD) { + result = DAG.getNode(ISD::ZERO_EXTEND, OutVT, result); + } else if (ExtType == ISD::EXTLOAD) { + unsigned NewOpc = ISD::ANY_EXTEND; - Ops[0] = the_chain; - Ops[1] = DAG.getNode(ISD::BIT_CONVERT, vecVT, result); - scalarvts = DAG.getVTList((OpVT == VT ? VT : OpVT), MVT::Other); - result = DAG.getNode(SPUISD::VEC2PREFSLOT_CHAINED, scalarvts, Ops, 2); - the_chain = result.getValue(1); - } else { - // Handle the sign and zero-extending loads for i1 and i8: - unsigned NewOpC; + if (OutVT.isFloatingPoint()) + NewOpc = ISD::FP_EXTEND; - if (ExtType == ISD::SEXTLOAD) { - NewOpC = (OpVT == MVT::i1 - ? SPUISD::EXTRACT_I1_SEXT - : SPUISD::EXTRACT_I8_SEXT); - } else { - assert(ExtType == ISD::ZEXTLOAD); - NewOpC = (OpVT == MVT::i1 - ? 
SPUISD::EXTRACT_I1_ZEXT - : SPUISD::EXTRACT_I8_ZEXT); - } - - result = DAG.getNode(NewOpC, OpVT, result); + result = DAG.getNode(NewOpc, OutVT, result); } - SDVTList retvts = DAG.getVTList(OpVT, MVT::Other); + SDVTList retvts = DAG.getVTList(OutVT, MVT::Other); SDValue retops[2] = { result, the_chain @@ -3034,10 +3021,16 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const SDValue combinedConst = DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(), Op0VT); - DEBUG(cerr << "Replace: (add " << CN0->getZExtValue() << ", " - << "(SPUindirect , " << CN1->getZExtValue() << "))\n"); - DEBUG(cerr << "With: (SPUindirect , " - << CN0->getZExtValue() + CN1->getZExtValue() << ")\n"); +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + cerr << "\n" + << "Replace: (add " << CN0->getZExtValue() << ", " + << "(SPUindirect , " << CN1->getZExtValue() << "))\n" + << "With: (SPUindirect , " + << CN0->getZExtValue() + CN1->getZExtValue() << ")\n"; + } +#endif + + return DAG.getNode(SPUISD::IndirectAddr, Op0VT, Op0.getOperand(0), combinedConst); } @@ -3071,11 +3064,14 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const // (any_extend (SPUextract_elt0 )) -> // (SPUextract_elt0 ) // Types must match, however... 
- DEBUG(cerr << "Replace: "); - DEBUG(N->dump(&DAG)); - DEBUG(cerr << "\nWith: "); - DEBUG(Op0.getNode()->dump(&DAG)); - DEBUG(cerr << "\n"); +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + cerr << "\nReplace: "; + N->dump(&DAG); + cerr << "\nWith: "; + Op0.getNode()->dump(&DAG); + cerr << "\n"; } +#endif return Op0; } @@ -3243,8 +3239,7 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, } case SPUISD::LDRESULT: - case SPUISD::VEC2PREFSLOT: - case SPUISD::VEC2PREFSLOT_CHAINED: { + case SPUISD::VEC2PREFSLOT: { MVT OpVT = Op.getValueType(); unsigned OpVTBits = OpVT.getSizeInBits(); uint64_t InMask = OpVT.getIntegerVTBitMask(); @@ -3254,10 +3249,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, } #if 0 - case EXTRACT_I1_ZEXT: - case EXTRACT_I1_SEXT: - case EXTRACT_I8_ZEXT: - case EXTRACT_I8_SEXT: case MPY: case MPYU: case MPYH: @@ -3272,7 +3263,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, case SPUISD::ROTQUAD_RZ_BYTES: case SPUISD::ROTQUAD_RZ_BITS: case SPUISD::ROTBYTES_LEFT: - case SPUISD::ROTBYTES_LEFT_CHAINED: case SPUISD::SELECT_MASK: case SPUISD::SELB: case SPUISD::FPInterp: diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index fefaa683cba..dd1f97f8d35 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -41,11 +41,6 @@ namespace llvm { CNTB, ///< Count leading ones in bytes PROMOTE_SCALAR, ///< Promote scalar->vector VEC2PREFSLOT, ///< Extract element 0 - VEC2PREFSLOT_CHAINED, ///< Extract element 0, with chain - EXTRACT_I1_ZEXT, ///< Extract element 0 as i1, zero extend - EXTRACT_I1_SEXT, ///< Extract element 0 as i1, sign extend - EXTRACT_I8_ZEXT, ///< Extract element 0 as i8, zero extend - EXTRACT_I8_SEXT, ///< Extract element 0 as i8, sign extend MPY, ///< 16-bit Multiply (low parts of a 32-bit) MPYU, ///< Multiply Unsigned MPYH, ///< Multiply High @@ -60,7 +55,6 @@ namespace llvm 
{ ROTQUAD_RZ_BYTES, ///< Rotate quad right, by bytes, zero fill ROTQUAD_RZ_BITS, ///< Rotate quad right, by bits, zero fill ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI) - ROTBYTES_LEFT_CHAINED, ///< Rotate bytes (loads -> ROTQBYI), with chain ROTBYTES_LEFT_BITS, ///< Rotate bytes left by bit shift count SELECT_MASK, ///< Select Mask (FSM, FSMB, FSMH, FSMBI) SELB, ///< Select bits -> (b & mask) | (a & ~mask) diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index 5d6d8af0cee..03f79d36ef4 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -1288,39 +1288,21 @@ def : Pat<(v2f64 (SPUpromote_scalar R64FP:$rA)), def : Pat<(SPUvec2prefslot (v16i8 VECREG:$rA)), (ORi8_v16i8 VECREG:$rA, VECREG:$rA)>; -def : Pat<(SPUvec2prefslot_chained (v16i8 VECREG:$rA)), - (ORi8_v16i8 VECREG:$rA, VECREG:$rA)>; - def : Pat<(SPUvec2prefslot (v8i16 VECREG:$rA)), (ORi16_v8i16 VECREG:$rA, VECREG:$rA)>; -def : Pat<(SPUvec2prefslot_chained (v8i16 VECREG:$rA)), - (ORi16_v8i16 VECREG:$rA, VECREG:$rA)>; - def : Pat<(SPUvec2prefslot (v4i32 VECREG:$rA)), (ORi32_v4i32 VECREG:$rA, VECREG:$rA)>; -def : Pat<(SPUvec2prefslot_chained (v4i32 VECREG:$rA)), - (ORi32_v4i32 VECREG:$rA, VECREG:$rA)>; - def : Pat<(SPUvec2prefslot (v2i64 VECREG:$rA)), (ORi64_v2i64 VECREG:$rA, VECREG:$rA)>; -def : Pat<(SPUvec2prefslot_chained (v2i64 VECREG:$rA)), - (ORi64_v2i64 VECREG:$rA, VECREG:$rA)>; - def : Pat<(SPUvec2prefslot (v4f32 VECREG:$rA)), (ORf32_v4f32 VECREG:$rA, VECREG:$rA)>; -def : Pat<(SPUvec2prefslot_chained (v4f32 VECREG:$rA)), - (ORf32_v4f32 VECREG:$rA, VECREG:$rA)>; - def : Pat<(SPUvec2prefslot (v2f64 VECREG:$rA)), (ORf64_v2f64 VECREG:$rA, VECREG:$rA)>; -def : Pat<(SPUvec2prefslot_chained (v2f64 VECREG:$rA)), - (ORf64_v2f64 VECREG:$rA, VECREG:$rA)>; - // ORC: Bitwise "or" with complement (c = a | ~b) class ORCInst pattern>: @@ -2147,15 +2129,6 @@ multiclass RotateQuadLeftByBytes defm ROTQBY: RotateQuadLeftByBytes; -def : 
Pat<(SPUrotbytes_left_chained (v16i8 VECREG:$rA), R32C:$rB), - (ROTQBYv16i8 VECREG:$rA, R32C:$rB)>; -def : Pat<(SPUrotbytes_left_chained (v8i16 VECREG:$rA), R32C:$rB), - (ROTQBYv8i16 VECREG:$rA, R32C:$rB)>; -def : Pat<(SPUrotbytes_left_chained (v4i32 VECREG:$rA), R32C:$rB), - (ROTQBYv4i32 VECREG:$rA, R32C:$rB)>; -def : Pat<(SPUrotbytes_left_chained (v2i64 VECREG:$rA), R32C:$rB), - (ROTQBYv2i64 VECREG:$rA, R32C:$rB)>; - //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // Rotate quad by byte (count), immediate //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ @@ -2179,15 +2152,6 @@ multiclass RotateQuadByBytesImm defm ROTQBYI: RotateQuadByBytesImm; -def : Pat<(SPUrotbytes_left_chained (v16i8 VECREG:$rA), (i16 uimm7:$val)), - (ROTQBYIv16i8 VECREG:$rA, uimm7:$val)>; -def : Pat<(SPUrotbytes_left_chained (v8i16 VECREG:$rA), (i16 uimm7:$val)), - (ROTQBYIv8i16 VECREG:$rA, uimm7:$val)>; -def : Pat<(SPUrotbytes_left_chained (v4i32 VECREG:$rA), (i16 uimm7:$val)), - (ROTQBYIv4i32 VECREG:$rA, uimm7:$val)>; -def : Pat<(SPUrotbytes_left_chained (v2i64 VECREG:$rA), (i16 uimm7:$val)), - (ROTQBYIv2i64 VECREG:$rA, uimm7:$val)>; - // See ROTQBY note above. 
class ROTQBYBIInst pattern>: RI7Form<0b00110011100, OOL, IOL, @@ -3972,10 +3936,6 @@ def : Pat<(ret), // Zero/Any/Sign extensions //===----------------------------------------------------------------------===// -// zext 1->32: Zero extend i1 to i32 -def : Pat<(SPUextract_i1_zext R32C:$rSrc), - (ANDIr32 R32C:$rSrc, 0x1)>; - // sext 8->32: Sign extend bytes to words def : Pat<(sext_inreg R32C:$rSrc, i8), (XSHWr32 (XSBHr32 R32C:$rSrc))>; @@ -3983,19 +3943,10 @@ def : Pat<(sext_inreg R32C:$rSrc, i8), def : Pat<(i32 (sext R8C:$rSrc)), (XSHWr16 (XSBHr8 R8C:$rSrc))>; -def : Pat<(SPUextract_i8_sext VECREG:$rSrc), - (XSHWr32 (XSBHr32 (ORi32_v4i32 (v4i32 VECREG:$rSrc), - (v4i32 VECREG:$rSrc))))>; - // zext 8->16: Zero extend bytes to halfwords def : Pat<(i16 (zext R8C:$rSrc)), (ANDHIi8i16 R8C:$rSrc, 0xff)>; -// zext 8->32 from preferred slot in load/store -def : Pat<(SPUextract_i8_zext VECREG:$rSrc), - (ANDIr32 (ORi32_v4i32 (v4i32 VECREG:$rSrc), (v4i32 VECREG:$rSrc)), - 0xff)>; - // zext 8->32: Zero extend bytes to words def : Pat<(i32 (zext R8C:$rSrc)), (ANDIi8i32 R8C:$rSrc, 0xff)>; diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td index c3b0c8c8909..1ed1e3ba51e 100644 --- a/lib/Target/CellSPU/SPUNodes.td +++ b/lib/Target/CellSPU/SPUNodes.td @@ -125,11 +125,6 @@ def SPUrotquad_rz_bits: SDNode<"SPUISD::ROTQUAD_RZ_BITS", def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT", SPUvecshift_type, []>; -// Same as above, but the node also has a chain associated (used in loads and -// stores) -def SPUrotbytes_left_chained : SDNode<"SPUISD::ROTBYTES_LEFT_CHAINED", - SPUvecshift_type, [SDNPHasChain]>; - // Vector rotate left by bytes, but the count is given in bits and the SPU // internally converts it to bytes (saves an instruction to mask off lower // three bits) @@ -153,13 +148,6 @@ def SPUpromote_scalar: SDNode<"SPUISD::PROMOTE_SCALAR", SDTpromote_scalar, []>; def SPU_vec_demote : SDTypeProfile<1, 1, []>; def SPUvec2prefslot: 
SDNode<"SPUISD::VEC2PREFSLOT", SPU_vec_demote, []>; -def SPU_vec_demote_chained : SDTypeProfile<1, 2, []>; -def SPUvec2prefslot_chained: SDNode<"SPUISD::VEC2PREFSLOT_CHAINED", - SPU_vec_demote_chained, [SDNPHasChain]>; -def SPUextract_i1_sext: SDNode<"SPUISD::EXTRACT_I1_SEXT", SPU_vec_demote, []>; -def SPUextract_i1_zext: SDNode<"SPUISD::EXTRACT_I1_ZEXT", SPU_vec_demote, []>; -def SPUextract_i8_sext: SDNode<"SPUISD::EXTRACT_I8_SEXT", SPU_vec_demote, []>; -def SPUextract_i8_zext: SDNode<"SPUISD::EXTRACT_I8_ZEXT", SPU_vec_demote, []>; // Address high and low components, used for [r+r] type addressing def SPUhi : SDNode<"SPUISD::Hi", SDTIntBinOp, []>;