diff --git a/lib/Target/CellSPU/SPU.td b/lib/Target/CellSPU/SPU.td index a5db1d9d2b5..8327fe03d7f 100644 --- a/lib/Target/CellSPU/SPU.td +++ b/lib/Target/CellSPU/SPU.td @@ -15,6 +15,13 @@ // include "llvm/Target/Target.td" +// Holder of code fragments (you'd think this'd already be in +// a td file somewhere... :-) + +class CodeFrag { + dag Fragment = frag; +} + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td index 6d679bac724..4159133770d 100644 --- a/lib/Target/CellSPU/SPU64InstrInfo.td +++ b/lib/Target/CellSPU/SPU64InstrInfo.td @@ -1,8 +1,17 @@ +//====--- SPU64InstrInfo.td - Cell SPU 64-bit operations -*- tablegen -*--====// +// +// Cell SPU 64-bit operations +// +// Primary author: Scott Michel (scottm@aero.org) +//===----------------------------------------------------------------------===// + //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // 64-bit comparisons: // // 1. The instruction sequences for vector vice scalar differ by a -// constant. +// constant. In the scalar case, we're only interested in the +// top two 32-bit slots, whereas we're interested in an exact +// all-four-slot match in the vector case. // // 2. There are no "immediate" forms, since loading 64-bit constants // could be a constant pool load. @@ -10,10 +19,10 @@ // 3. i64 setcc results are i32, which are subsequently converted to a FSM // mask when used in a select pattern. // -// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask -// (TODO) +// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask (TODO) +// [Note: this may be moot, since gb produces v4i32 or r32.] // -// M00$E Kan be Pretty N@sTi!!!!! (appologies to Monty!) +// M00$E B!tes Kan be Pretty N@sTi!!!!! (appologies to Monty!) 
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // selb instruction definition for i64. Note that the selection mask is @@ -22,17 +31,15 @@ def SELBr64_cond: SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC), [/* no pattern */]>; -class CodeFrag { - dag Fragment = frag; -} - -class I64SELECTNegCond: +// select the negative condition: +class I64SELECTNegCond: Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse), - (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 cmpare.Fragment))>; + (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 compare.Fragment))>; -class I64SETCCNegCond: +// setcc the negative condition: +class I64SETCCNegCond: Pat<(cond R64C:$rA, R64C:$rB), - (XORIr32 cmpare.Fragment, -1)>; + (XORIr32 compare.Fragment, -1)>; // The i64 seteq fragment that does the scalar->vector conversion and // comparison: @@ -64,14 +71,13 @@ multiclass CompareEqual64 { defm I64EQ: CompareEqual64; def : Pat<(seteq R64C:$rA, R64C:$rB), I64EQr64.Fragment>; +def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), I64EQv2i64.Fragment>; -def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), - I64EQv2i64.Fragment>; - -def I64Select: - Pat<(select R32C:$rC, R64C:$rB, R64C:$rA), - (SELBr64_cond R64C:$rA, R64C:$rB, (FSMr32 R32C:$rC))>; +def : Pat<(select R32C:$rC, R64C:$rB, R64C:$rA), + (SELBr64_cond R64C:$rA, R64C:$rB, (FSMr32 R32C:$rC))>; +// i64 setne: def : I64SETCCNegCond; +def : I64SELECTNegCond; -def : I64SELECTNegCond; \ No newline at end of file +// i64 setugt: diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index f51aba2fda6..76b22843696 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -149,7 +149,7 @@ namespace { } bool - isHighLow(const SDValue &Op) + isHighLow(const SDValue &Op) { return (Op.getOpcode() == SPUISD::IndirectAddr && ((Op.getOperand(0).getOpcode() == SPUISD::Hi @@ -229,14 +229,14 @@ public: TM(tm), 
SPUtli(*tm.getTargetLowering()) {} - + virtual bool runOnFunction(Function &Fn) { // Make sure we re-emit a set of the global base reg if necessary GlobalBaseReg = 0; SelectionDAGISel::runOnFunction(Fn); return true; } - + /// getI32Imm - Return a target constant with the specified value, of type /// i32. inline SDValue getI32Imm(uint32_t Imm) { @@ -248,7 +248,7 @@ public: inline SDValue getI64Imm(uint64_t Imm) { return CurDAG->getTargetConstant(Imm, MVT::i64); } - + /// getSmallIPtrImm - Return a target constant of pointer type. inline SDValue getSmallIPtrImm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy()); @@ -258,6 +258,15 @@ public: /// target-specific node if it hasn't already been changed. SDNode *Select(SDValue Op); + //! Emit the instruction sequence for i64 shl + SDNode *SelectSHLi64(SDValue &Op, MVT OpVT); + + //! Emit the instruction sequence for i64 srl + SDNode *SelectSRLi64(SDValue &Op, MVT OpVT); + + //! Emit the instruction sequence for i64 sra + SDNode *SelectSRAi64(SDValue &Op, MVT OpVT); + //! Returns true if the address N is an A-form (local store) address bool SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base, SDValue &Index); @@ -287,7 +296,7 @@ public: switch (ConstraintCode) { default: return true; case 'm': // memory - if (!SelectDFormAddr(Op, Op, Op0, Op1) + if (!SelectDFormAddr(Op, Op, Op0, Op1) && !SelectAFormAddr(Op, Op, Op0, Op1)) SelectXFormAddr(Op, Op, Op0, Op1); break; @@ -306,7 +315,7 @@ public: #endif break; } - + OutOps.push_back(Op0); OutOps.push_back(Op1); return false; @@ -318,14 +327,14 @@ public: virtual const char *getPassName() const { return "Cell SPU DAG->DAG Pattern Instruction Selection"; - } - + } + /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for /// this target when scheduling the DAG. 
virtual HazardRecognizer *CreateTargetHazardRecognizer() { const TargetInstrInfo *II = TM.getInstrInfo(); assert(II && "No InstrInfo?"); - return new SPUHazardRecognizer(*II); + return new SPUHazardRecognizer(*II); } // Include the pieces autogenerated from the target description. @@ -375,7 +384,7 @@ SPUDAGToDAGISel::SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base, abort(); /*NOTREACHED*/ - case SPUISD::AFormAddr: + case SPUISD::AFormAddr: // Just load from memory if there's only a single use of the location, // otherwise, this will get handled below with D-form offset addresses if (N.hasOneUse()) { @@ -404,7 +413,7 @@ SPUDAGToDAGISel::SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base, return false; } -bool +bool SPUDAGToDAGISel::SelectDForm2Addr(SDValue Op, SDValue N, SDValue &Disp, SDValue &Base) { const int minDForm2Offset = -(1 << 7); @@ -527,7 +536,7 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Base, ConstantSDNode *CN = cast(Op0); offset = int32_t(CN->getSExtValue()); idxOp = Op1; - } + } if (offset >= minOffset && offset <= maxOffset) { Base = CurDAG->getTargetConstant(offset, PtrTy); @@ -622,27 +631,20 @@ SPUDAGToDAGISel::Select(SDValue Op) { if (N->isMachineOpcode()) { return NULL; // Already selected. 
} else if (Opc == ISD::FrameIndex) { - // Selects to (add $sp, FI * stackSlotSize) - int FI = - SPUFrameInfo::FItoStackOffset(cast(N)->getIndex()); - MVT PtrVT = SPUtli.getPointerTy(); + int FI = cast(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, Op.getValueType()); + SDValue Imm0 = CurDAG->getTargetConstant(0, Op.getValueType()); - // Adjust stack slot to actual offset in frame: - if (isS10Constant(FI)) { - DEBUG(cerr << "SPUDAGToDAGISel: Replacing FrameIndex with AIr32 $sp, " - << FI - << "\n"); + if (FI < 128) { NewOpc = SPU::AIr32; - Ops[0] = CurDAG->getRegister(SPU::R1, PtrVT); - Ops[1] = CurDAG->getTargetConstant(FI, PtrVT); + Ops[0] = TFI; + Ops[1] = Imm0; n_ops = 2; } else { - DEBUG(cerr << "SPUDAGToDAGISel: Replacing FrameIndex with Ar32 $sp, " - << FI - << "\n"); NewOpc = SPU::Ar32; - Ops[0] = CurDAG->getRegister(SPU::R1, PtrVT); - Ops[1] = CurDAG->getConstant(FI, PtrVT); + Ops[0] = CurDAG->getRegister(SPU::R1, Op.getValueType()); + Ops[1] = SDValue(CurDAG->getTargetNode(SPU::ILAr32, Op.getValueType(), + TFI, Imm0), 0); n_ops = 2; } } else if (Opc == ISD::ZERO_EXTEND) { @@ -661,6 +663,18 @@ SPUDAGToDAGISel::Select(SDValue Op) { n_ops = 2; } } + } else if (Opc == ISD::SHL) { + if (OpVT == MVT::i64) { + return SelectSHLi64(Op, OpVT); + } + } else if (Opc == ISD::SRL) { + if (OpVT == MVT::i64) { + return SelectSRLi64(Op, OpVT); + } + } else if (Opc == ISD::SRA) { + if (OpVT == MVT::i64) { + return SelectSRAi64(Op, OpVT); + } } else if (Opc == SPUISD::LDRESULT) { // Custom select instructions for LDRESULT MVT VT = N->getValueType(0); @@ -713,7 +727,7 @@ SPUDAGToDAGISel::Select(SDValue Op) { n_ops = 2; } } - + if (n_ops > 0) { if (N->hasOneUse()) return CurDAG->SelectNodeTo(N, NewOpc, OpVT, Ops, n_ops); @@ -723,7 +737,213 @@ SPUDAGToDAGISel::Select(SDValue Op) { return SelectCode(Op); } -/// createPPCISelDag - This pass converts a legalized DAG into a +/*! + * Emit the instruction sequence for i64 left shifts. 
The basic algorithm + * is to fill the bottom two word slots with zeros so that zeros are shifted + * in as the entire quadword is shifted left. + * + * \note This code could also be used to implement v2i64 shl. + * + * @param Op The shl operand + * @param OpVT Op's machine value value type (doesn't need to be passed, but + * makes life easier.) + * @return The SDNode with the entire instruction sequence + */ +SDNode * +SPUDAGToDAGISel::SelectSHLi64(SDValue &Op, MVT OpVT) { + SDValue Op0 = Op.getOperand(0); + MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits())); + SDValue ShiftAmt = Op.getOperand(1); + MVT ShiftAmtVT = ShiftAmt.getValueType(); + SDNode *VecOp0, *SelMask, *ZeroFill, *Shift = 0; + SDValue SelMaskVal; + + VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op0); + SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16); + SelMask = CurDAG->getTargetNode(SPU::FSMBIv2i64, VecVT, SelMaskVal); + ZeroFill = CurDAG->getTargetNode(SPU::ILv2i64, VecVT, + CurDAG->getTargetConstant(0, OpVT)); + VecOp0 = CurDAG->getTargetNode(SPU::SELBv2i64, VecVT, + SDValue(ZeroFill, 0), + SDValue(VecOp0, 0), + SDValue(SelMask, 0)); + + if (ConstantSDNode *CN = dyn_cast(ShiftAmt)) { + unsigned bytes = unsigned(CN->getZExtValue()) >> 3; + unsigned bits = unsigned(CN->getZExtValue()) & 7; + + if (bytes > 0) { + Shift = + CurDAG->getTargetNode(SPU::SHLQBYIv2i64, VecVT, + SDValue(VecOp0, 0), + CurDAG->getTargetConstant(bytes, ShiftAmtVT)); + } + + if (bits > 0) { + Shift = + CurDAG->getTargetNode(SPU::SHLQBIIv2i64, VecVT, + SDValue((Shift != 0 ? 
Shift : VecOp0), 0), + CurDAG->getTargetConstant(bits, ShiftAmtVT)); + } + } else { + SDNode *Bytes = + CurDAG->getTargetNode(SPU::ROTMIr32, ShiftAmtVT, + ShiftAmt, + CurDAG->getTargetConstant(3, ShiftAmtVT)); + SDNode *Bits = + CurDAG->getTargetNode(SPU::ANDIr32, ShiftAmtVT, + ShiftAmt, + CurDAG->getTargetConstant(7, ShiftAmtVT)); + Shift = + CurDAG->getTargetNode(SPU::SHLQBYv2i64, VecVT, + SDValue(VecOp0, 0), SDValue(Bytes, 0)); + Shift = + CurDAG->getTargetNode(SPU::SHLQBIv2i64, VecVT, + SDValue(Shift, 0), SDValue(Bits, 0)); + } + + return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0)); +} + +/*! + * Emit the instruction sequence for i64 logical right shifts. + * + * @param Op The shl operand + * @param OpVT Op's machine value value type (doesn't need to be passed, but + * makes life easier.) + * @return The SDNode with the entire instruction sequence + */ +SDNode * +SPUDAGToDAGISel::SelectSRLi64(SDValue &Op, MVT OpVT) { + SDValue Op0 = Op.getOperand(0); + MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits())); + SDValue ShiftAmt = Op.getOperand(1); + MVT ShiftAmtVT = ShiftAmt.getValueType(); + SDNode *VecOp0, *Shift = 0; + + VecOp0 = CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op0); + + if (ConstantSDNode *CN = dyn_cast(ShiftAmt)) { + unsigned bytes = unsigned(CN->getZExtValue()) >> 3; + unsigned bits = unsigned(CN->getZExtValue()) & 7; + + if (bytes > 0) { + Shift = + CurDAG->getTargetNode(SPU::ROTQMBYIv2i64, VecVT, + SDValue(VecOp0, 0), + CurDAG->getTargetConstant(bytes, ShiftAmtVT)); + } + + if (bits > 0) { + Shift = + CurDAG->getTargetNode(SPU::ROTQMBIIv2i64, VecVT, + SDValue((Shift != 0 ? 
Shift : VecOp0), 0), + CurDAG->getTargetConstant(bits, ShiftAmtVT)); + } + } else { + SDNode *Bytes = + CurDAG->getTargetNode(SPU::ROTMIr32, ShiftAmtVT, + ShiftAmt, + CurDAG->getTargetConstant(3, ShiftAmtVT)); + SDNode *Bits = + CurDAG->getTargetNode(SPU::ANDIr32, ShiftAmtVT, + ShiftAmt, + CurDAG->getTargetConstant(7, ShiftAmtVT)); + + // Ensure that the shift amounts are negated! + Bytes = CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT, + SDValue(Bytes, 0), + CurDAG->getTargetConstant(0, ShiftAmtVT)); + + Bits = CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT, + SDValue(Bits, 0), + CurDAG->getTargetConstant(0, ShiftAmtVT)); + + Shift = + CurDAG->getTargetNode(SPU::ROTQMBYv2i64, VecVT, + SDValue(VecOp0, 0), SDValue(Bytes, 0)); + Shift = + CurDAG->getTargetNode(SPU::ROTQMBIv2i64, VecVT, + SDValue(Shift, 0), SDValue(Bits, 0)); + } + + return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0)); +} + +/*! + * Emit the instruction sequence for i64 arithmetic right shifts. + * + * @param Op The shl operand + * @param OpVT Op's machine value value type (doesn't need to be passed, but + * makes life easier.) 
+ * @return The SDNode with the entire instruction sequence + */ +SDNode * +SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, MVT OpVT) { + // Promote Op0 to vector + MVT VecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits())); + SDValue ShiftAmt = Op.getOperand(1); + MVT ShiftAmtVT = ShiftAmt.getValueType(); + + SDNode *VecOp0 = + CurDAG->getTargetNode(SPU::ORv2i64_i64, VecVT, Op.getOperand(0)); + + SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT); + SDNode *SignRot = + CurDAG->getTargetNode(SPU::ROTMAIv2i64_i32, MVT::v2i64, + SDValue(VecOp0, 0), SignRotAmt); + SDNode *UpperHalfSign = + CurDAG->getTargetNode(SPU::ORi32_v4i32, MVT::i32, SDValue(SignRot, 0)); + + SDNode *UpperHalfSignMask = + CurDAG->getTargetNode(SPU::FSM64r32, VecVT, SDValue(UpperHalfSign, 0)); + SDNode *UpperLowerMask = + CurDAG->getTargetNode(SPU::FSMBIv2i64, VecVT, + CurDAG->getTargetConstant(0xff00ULL, MVT::i16)); + SDNode *UpperLowerSelect = + CurDAG->getTargetNode(SPU::SELBv2i64, VecVT, + SDValue(UpperHalfSignMask, 0), + SDValue(VecOp0, 0), + SDValue(UpperLowerMask, 0)); + + SDNode *Shift = 0; + + if (ConstantSDNode *CN = dyn_cast(ShiftAmt)) { + unsigned bytes = unsigned(CN->getZExtValue()) >> 3; + unsigned bits = unsigned(CN->getZExtValue()) & 7; + + if (bytes > 0) { + bytes = 31 - bytes; + Shift = + CurDAG->getTargetNode(SPU::ROTQBYIv2i64, VecVT, + SDValue(UpperLowerSelect, 0), + CurDAG->getTargetConstant(bytes, ShiftAmtVT)); + } + + if (bits > 0) { + bits = 8 - bits; + Shift = + CurDAG->getTargetNode(SPU::ROTQBIIv2i64, VecVT, + SDValue((Shift != 0 ? 
Shift : UpperLowerSelect), 0), + CurDAG->getTargetConstant(bits, ShiftAmtVT)); + } + } else { + SDNode *NegShift = + CurDAG->getTargetNode(SPU::SFIr32, ShiftAmtVT, + ShiftAmt, CurDAG->getTargetConstant(0, ShiftAmtVT)); + + Shift = + CurDAG->getTargetNode(SPU::ROTQBYBIv2i64_r32, VecVT, + SDValue(UpperLowerSelect, 0), SDValue(NegShift, 0)); + Shift = + CurDAG->getTargetNode(SPU::ROTQBIv2i64, VecVT, + SDValue(Shift, 0), SDValue(NegShift, 0)); + } + + return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0)); +} + +/// createSPUISelDag - This pass converts a legalized DAG into a /// SPU-specific DAG, ready for instruction scheduling. /// FunctionPass *llvm::createSPUISelDag(SPUTargetMachine &TM) { diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 0822181d3b7..5ccfd14aa4b 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -204,10 +204,10 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SRL, MVT::i8, Custom); setOperationAction(ISD::SRA, MVT::i8, Custom); - // SPU needs custom lowering for shift left/right for i64 - setOperationAction(ISD::SHL, MVT::i64, Custom); - setOperationAction(ISD::SRL, MVT::i64, Custom); - setOperationAction(ISD::SRA, MVT::i64, Custom); + // Make these operations legal and handle them during instruction selection: + setOperationAction(ISD::SHL, MVT::i64, Legal); + setOperationAction(ISD::SRL, MVT::i64, Legal); + setOperationAction(ISD::SRA, MVT::i64, Legal); // Custom lower i8, i32 and i64 multiplications setOperationAction(ISD::MUL, MVT::i8, Custom); @@ -215,6 +215,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::MUL, MVT::i64, Expand); // libcall // Need to custom handle (some) common i8, i64 math ops + setOperationAction(ISD::ADD, MVT::i8, Custom); setOperationAction(ISD::ADD, MVT::i64, Custom); setOperationAction(ISD::SUB, MVT::i8, Custom); 
setOperationAction(ISD::SUB, MVT::i64, Custom); @@ -249,7 +250,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) // Zero extension and sign extension for i64 have to be // custom legalized setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); // Custom lower i128 -> i64 truncates @@ -262,7 +262,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); // FDIV on SPU requires custom lowering - setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Expand); // libcall // SPU has [U|S]INT_TO_FP @@ -340,7 +339,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::ADD , VT, Legal); setOperationAction(ISD::SUB , VT, Legal); // mul has to be custom lowered. - setOperationAction(ISD::MUL , VT, Custom); + // TODO: v2i64 vector multiply + setOperationAction(ISD::MUL , VT, Legal); setOperationAction(ISD::AND , VT, Legal); setOperationAction(ISD::OR , VT, Legal); @@ -354,7 +354,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::FDIV, VT, Custom); // Custom lower build_vector, constant pool spills, insert and // extract vector elements: @@ -371,9 +370,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::XOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); - // FIXME: This is only temporary until I put all vector multiplications in - // SPUInstrInfo.td: - setOperationAction(ISD::MUL, MVT::v4i32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setShiftAmountType(MVT::i32); setBooleanContents(ZeroOrNegativeOneBooleanContent); @@ -411,10 +408,6 @@ 
SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::MPY] = "SPUISD::MPY"; - node_names[(unsigned) SPUISD::MPYU] = "SPUISD::MPYU"; - node_names[(unsigned) SPUISD::MPYH] = "SPUISD::MPYH"; - node_names[(unsigned) SPUISD::MPYHH] = "SPUISD::MPYHH"; node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS"; node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES"; node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL"; @@ -422,21 +415,12 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA"; node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; - node_names[(unsigned) SPUISD::ROTQUAD_RZ_BYTES] = - "SPUISD::ROTQUAD_RZ_BYTES"; - node_names[(unsigned) SPUISD::ROTQUAD_RZ_BITS] = - "SPUISD::ROTQUAD_RZ_BITS"; - node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; - node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] = - "SPUISD::ROTBYTES_LEFT_BITS"; node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK"; node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB"; node_names[(unsigned) SPUISD::ADD_EXTENDED] = "SPUISD::ADD_EXTENDED"; node_names[(unsigned) SPUISD::CARRY_GENERATE] = "SPUISD::CARRY_GENERATE"; node_names[(unsigned) SPUISD::SUB_EXTENDED] = "SPUISD::SUB_EXTENDED"; node_names[(unsigned) SPUISD::BORROW_GENERATE] = "SPUISD::BORROW_GENERATE"; - node_names[(unsigned) SPUISD::FPInterp] = "SPUISD::FPInterp"; - node_names[(unsigned) SPUISD::FPRecipEst] = "SPUISD::FPRecipEst"; node_names[(unsigned) SPUISD::SEXT32TO64] = "SPUISD::SEXT32TO64"; } @@ -1922,182 +1906,6 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { return SDValue(); 
} -static SDValue LowerVectorMUL(SDValue Op, SelectionDAG &DAG) { - switch (Op.getValueType().getSimpleVT()) { - default: - cerr << "CellSPU: Unknown vector multiplication, got " - << Op.getValueType().getMVTString() - << "\n"; - abort(); - /*NOTREACHED*/ - - case MVT::v4i32: - break; - - // Multiply two v8i16 vectors (pipeline friendly version): - // a) multiply lower halves, mask off upper 16-bit of 32-bit product - // b) multiply upper halves, rotate left by 16 bits (inserts 16 lower zeroes) - // c) Use SELB to select upper and lower halves from the intermediate results - // - // NOTE: We really want to move the SELECT_MASK to earlier to actually get the - // dual-issue. This code does manage to do this, even if it's a little on - // the wacky side - case MVT::v8i16: { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - SDValue Chain = Op.getOperand(0); - SDValue rA = Op.getOperand(0); - SDValue rB = Op.getOperand(1); - unsigned FSMBIreg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass); - unsigned HiProdReg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass); - - SDValue FSMBOp = - DAG.getCopyToReg(Chain, FSMBIreg, - DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16, - DAG.getConstant(0xcccc, MVT::i16))); - - SDValue HHProd = - DAG.getCopyToReg(FSMBOp, HiProdReg, - DAG.getNode(SPUISD::MPYHH, MVT::v8i16, rA, rB)); - - SDValue HHProd_v4i32 = - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, - DAG.getCopyFromReg(HHProd, HiProdReg, MVT::v4i32)); - - return DAG.getNode(SPUISD::SELB, MVT::v8i16, - DAG.getNode(SPUISD::MPY, MVT::v8i16, rA, rB), - DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), - DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, - HHProd_v4i32, - DAG.getConstant(16, MVT::i16))), - DAG.getCopyFromReg(FSMBOp, FSMBIreg, MVT::v4i32)); - } - - // This M00sE is N@stI! 
(apologies to Monty Python) - // - // SPU doesn't know how to do any 8-bit multiplication, so the solution - // is to break it all apart, sign extend, and reassemble the various - // intermediate products. - case MVT::v16i8: { - SDValue rA = Op.getOperand(0); - SDValue rB = Op.getOperand(1); - SDValue c8 = DAG.getConstant(8, MVT::i32); - SDValue c16 = DAG.getConstant(16, MVT::i32); - - SDValue LLProd = - DAG.getNode(SPUISD::MPY, MVT::v8i16, - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rA), - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rB)); - - SDValue rALH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rA, c8); - - SDValue rBLH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rB, c8); - - SDValue LHProd = - DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, - DAG.getNode(SPUISD::MPY, MVT::v8i16, rALH, rBLH), c8); - - SDValue FSMBmask = DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16, - DAG.getConstant(0x2222, MVT::i16)); - - SDValue LoProdParts = - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, - DAG.getNode(SPUISD::SELB, MVT::v8i16, - LLProd, LHProd, FSMBmask)); - - SDValue LoProdMask = DAG.getConstant(0xffff, MVT::i32); - - SDValue LoProd = - DAG.getNode(ISD::AND, MVT::v4i32, - LoProdParts, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - LoProdMask, LoProdMask, - LoProdMask, LoProdMask)); - - SDValue rAH = - DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rA), c16); - - SDValue rBH = - DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rB), c16); - - SDValue HLProd = - DAG.getNode(SPUISD::MPY, MVT::v8i16, - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rAH), - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rBH)); - - SDValue HHProd_1 = - DAG.getNode(SPUISD::MPY, MVT::v8i16, - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, - DAG.getNode(SPUISD::VEC_SRA, - MVT::v4i32, rAH, c8)), - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, - DAG.getNode(SPUISD::VEC_SRA, - MVT::v4i32, rBH, c8))); - - SDValue HHProd = - DAG.getNode(SPUISD::SELB, MVT::v8i16, - 
HLProd, - DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, HHProd_1, c8), - FSMBmask); - - SDValue HiProd = - DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, HHProd, c16); - - return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, - DAG.getNode(ISD::OR, MVT::v4i32, - LoProd, HiProd)); - } - } - - return SDValue(); -} - -static SDValue LowerFDIVf32(SDValue Op, SelectionDAG &DAG) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - - SDValue A = Op.getOperand(0); - SDValue B = Op.getOperand(1); - MVT VT = Op.getValueType(); - - unsigned VRegBR, VRegC; - - if (VT == MVT::f32) { - VRegBR = RegInfo.createVirtualRegister(&SPU::R32FPRegClass); - VRegC = RegInfo.createVirtualRegister(&SPU::R32FPRegClass); - } else { - VRegBR = RegInfo.createVirtualRegister(&SPU::VECREGRegClass); - VRegC = RegInfo.createVirtualRegister(&SPU::VECREGRegClass); - } - // TODO: make sure we're feeding FPInterp the right arguments - // Right now: fi B, frest(B) - - // Computes BRcpl = - // (Floating Interpolate (FP Reciprocal Estimate B)) - SDValue BRcpl = - DAG.getCopyToReg(DAG.getEntryNode(), VRegBR, - DAG.getNode(SPUISD::FPInterp, VT, B, - DAG.getNode(SPUISD::FPRecipEst, VT, B))); - - // Computes A * BRcpl and stores in a temporary register - SDValue AxBRcpl = - DAG.getCopyToReg(BRcpl, VRegC, - DAG.getNode(ISD::FMUL, VT, A, - DAG.getCopyFromReg(BRcpl, VRegBR, VT))); - // What's the Chain variable do? It's magic! 
- // TODO: set Chain = Op(0).getEntryNode() - - return DAG.getNode(ISD::FADD, VT, - DAG.getCopyFromReg(AxBRcpl, VRegC, VT), - DAG.getNode(ISD::FMUL, VT, - DAG.getCopyFromReg(AxBRcpl, VRegBR, VT), - DAG.getNode(ISD::FSUB, VT, A, - DAG.getNode(ISD::FMUL, VT, B, - DAG.getCopyFromReg(AxBRcpl, VRegC, VT))))); -} - static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); SDValue N = Op.getOperand(0); @@ -2296,18 +2104,23 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc, assert(0 && "Unhandled i8 math operator"); /*NOTREACHED*/ break; + case ISD::ADD: { + // 8-bit addition: Promote the arguments up to 16-bits and truncate + // the result: + SDValue N1 = Op.getOperand(1); + N0 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1); + return DAG.getNode(ISD::TRUNCATE, MVT::i8, + DAG.getNode(Opc, MVT::i16, N0, N1)); + + } + case ISD::SUB: { // 8-bit subtraction: Promote the arguments up to 16-bits and truncate // the result: SDValue N1 = Op.getOperand(1); - N0 = (N0.getOpcode() != ISD::Constant - ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0) - : DAG.getConstant(cast(N0)->getSExtValue(), - MVT::i16)); - N1 = (N1.getOpcode() != ISD::Constant - ? 
DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1) - : DAG.getConstant(cast(N1)->getSExtValue(), - MVT::i16)); + N0 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1); return DAG.getNode(ISD::TRUNCATE, MVT::i8, DAG.getNode(Opc, MVT::i16, N0, N1)); } @@ -2397,7 +2210,6 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) switch (Opc) { case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { MVT Op0VT = Op0.getValueType(); MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits())); @@ -2410,39 +2222,16 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) SDValue PromoteScalar = DAG.getNode(SPUISD::PREFSLOT2VEC, Op0VecVT, Op0); - if (Opc != ISD::SIGN_EXTEND) { - // Use a shuffle to zero extend the i32 to i64 directly: - SDValue shufMask = - DAG.getNode(ISD::BUILD_VECTOR, Op0VecVT, - DAG.getConstant(0x80808080, MVT::i32), - DAG.getConstant(0x00010203, MVT::i32), - DAG.getConstant(0x80808080, MVT::i32), - DAG.getConstant(0x08090a0b, MVT::i32)); - SDValue zextShuffle = - DAG.getNode(SPUISD::SHUFB, Op0VecVT, - PromoteScalar, PromoteScalar, shufMask); + // Use a shuffle to zero extend the i32 to i64 directly: + SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, Op0VecVT, + DAG.getConstant(0x80808080, MVT::i32), DAG.getConstant(0x00010203, + MVT::i32), DAG.getConstant(0x80808080, MVT::i32), DAG.getConstant( + 0x08090a0b, MVT::i32)); + SDValue zextShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT, PromoteScalar, + PromoteScalar, shufMask); - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(ISD::BIT_CONVERT, VecVT, zextShuffle)); - } else { - // SPU has no "rotate quadword and replicate bit 0" (i.e. rotate/shift - // right and propagate the sign bit) instruction. 
- SDValue RotQuad = - DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, Op0VecVT, - PromoteScalar, DAG.getConstant(4, MVT::i32)); - SDValue SignQuad = - DAG.getNode(SPUISD::VEC_SRA, Op0VecVT, - PromoteScalar, DAG.getConstant(32, MVT::i32)); - SDValue SelMask = - DAG.getNode(SPUISD::SELECT_MASK, Op0VecVT, - DAG.getConstant(0xf0f0, MVT::i16)); - SDValue CombineQuad = - DAG.getNode(SPUISD::SELB, Op0VecVT, - SignQuad, RotQuad, SelMask); - - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(ISD::BIT_CONVERT, VecVT, CombineQuad)); - } + return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, DAG.getNode(ISD::BIT_CONVERT, + VecVT, zextShuffle)); } case ISD::ADD: { @@ -2502,88 +2291,6 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) DAG.getNode(SPUISD::SUB_EXTENDED, MVT::v2i64, Op0, Op1, ShiftedBorrow)); } - - case ISD::SHL: { - SDValue ShiftAmt = Op.getOperand(1); - MVT ShiftAmtVT = ShiftAmt.getValueType(); - SDValue Op0Vec = DAG.getNode(SPUISD::PREFSLOT2VEC, VecVT, Op0); - SDValue MaskLower = - DAG.getNode(SPUISD::SELB, VecVT, - Op0Vec, - DAG.getConstant(0, VecVT), - DAG.getNode(SPUISD::SELECT_MASK, VecVT, - DAG.getConstant(0xff00ULL, MVT::i16))); - SDValue ShiftAmtBytes = - DAG.getNode(ISD::SRL, ShiftAmtVT, - ShiftAmt, - DAG.getConstant(3, ShiftAmtVT)); - SDValue ShiftAmtBits = - DAG.getNode(ISD::AND, ShiftAmtVT, - ShiftAmt, - DAG.getConstant(7, ShiftAmtVT)); - - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(SPUISD::SHLQUAD_L_BITS, VecVT, - DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT, - MaskLower, ShiftAmtBytes), - ShiftAmtBits)); - } - - case ISD::SRL: { - MVT VT = Op.getValueType(); - SDValue ShiftAmt = Op.getOperand(1); - MVT ShiftAmtVT = ShiftAmt.getValueType(); - SDValue ShiftAmtBytes = - DAG.getNode(ISD::SRL, ShiftAmtVT, - ShiftAmt, - DAG.getConstant(3, ShiftAmtVT)); - SDValue ShiftAmtBits = - DAG.getNode(ISD::AND, ShiftAmtVT, - ShiftAmt, - DAG.getConstant(7, ShiftAmtVT)); - - return DAG.getNode(SPUISD::ROTQUAD_RZ_BITS, VT, - 
DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, VT, - Op0, ShiftAmtBytes), - ShiftAmtBits); - } - - case ISD::SRA: { - // Promote Op0 to vector - SDValue Op0 = - DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(0)); - SDValue ShiftAmt = Op.getOperand(1); - MVT ShiftVT = ShiftAmt.getValueType(); - - // Negate variable shift amounts - if (!isa(ShiftAmt)) { - ShiftAmt = DAG.getNode(ISD::SUB, ShiftVT, - DAG.getConstant(0, ShiftVT), ShiftAmt); - } - - SDValue UpperHalfSign = - DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, - DAG.getNode(SPUISD::VEC_SRA, MVT::v2i64, - Op0, DAG.getConstant(31, MVT::i32)))); - SDValue UpperHalfSignMask = - DAG.getNode(SPUISD::SELECT_MASK, MVT::v2i64, UpperHalfSign); - SDValue UpperLowerMask = - DAG.getNode(SPUISD::SELECT_MASK, MVT::v2i64, - DAG.getConstant(0xff00, MVT::i16)); - SDValue UpperLowerSelect = - DAG.getNode(SPUISD::SELB, MVT::v2i64, - UpperHalfSignMask, Op0, UpperLowerMask); - SDValue RotateLeftBytes = - DAG.getNode(SPUISD::ROTBYTES_LEFT_BITS, MVT::v2i64, - UpperLowerSelect, ShiftAmt); - SDValue RotateLeftBits = - DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v2i64, - RotateLeftBytes, ShiftAmt); - - return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64, - RotateLeftBits); - } } return SDValue(); @@ -2890,10 +2597,11 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) return LowerRET(Op, DAG, getTargetMachine()); - // i8, i64 math ops: case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: + return LowerI64Math(Op, DAG, Opc); + + // i8, i64 math ops: case ISD::ADD: case ISD::SUB: case ISD::ROTR: @@ -2928,22 +2636,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) // Vector and i8 multiply: case ISD::MUL: - if (VT.isVector()) - return LowerVectorMUL(Op, DAG); - else if (VT == MVT::i8) + if (VT == MVT::i8) return LowerI8Math(Op, DAG, Opc, *this); - case ISD::FDIV: - if (VT == MVT::f32 || VT == MVT::v4f32) - return LowerFDIVf32(Op, DAG); -#if 0 - // 
This is probably a libcall - else if (Op.getValueType() == MVT::f64) - return LowerFDIVf64(Op, DAG); -#endif - else - assert(0 && "Calling FDIV on unsupported MVT"); - case ISD::CTPOP: return LowerCTPOP(Op, DAG); @@ -3119,8 +2814,6 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const case SPUISD::VEC_SHL: case SPUISD::VEC_SRL: case SPUISD::VEC_SRA: - case SPUISD::ROTQUAD_RZ_BYTES: - case SPUISD::ROTQUAD_RZ_BITS: case SPUISD::ROTBYTES_LEFT: { SDValue Op1 = N->getOperand(1); @@ -3268,10 +2961,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, } #if 0 - case MPY: - case MPYU: - case MPYH: - case MPYHH: case SPUISD::SHLQUAD_L_BITS: case SPUISD::SHLQUAD_L_BYTES: case SPUISD::VEC_SHL: @@ -3279,18 +2968,14 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, case SPUISD::VEC_SRA: case SPUISD::VEC_ROTL: case SPUISD::VEC_ROTR: - case SPUISD::ROTQUAD_RZ_BYTES: - case SPUISD::ROTQUAD_RZ_BITS: case SPUISD::ROTBYTES_LEFT: case SPUISD::SELECT_MASK: case SPUISD::SELB: - case SPUISD::FPInterp: - case SPUISD::FPRecipEst: case SPUISD::SEXT32TO64: #endif } } - + unsigned SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, unsigned Depth) const { diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index 8d2e9945455..0eed9b0cfc5 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -24,10 +24,10 @@ namespace llvm { enum NodeType { // Start the numbering where the builting ops and target ops leave off. 
FIRST_NUMBER = ISD::BUILTIN_OP_END, - + // Pseudo instructions: RET_FLAG, ///< Return with flag, matched by bi instruction - + Hi, ///< High address component (upper 16) Lo, ///< Low address component (lower 16) PCRelAddr, ///< Program counter relative address @@ -41,10 +41,6 @@ namespace llvm { CNTB, ///< Count leading ones in bytes PREFSLOT2VEC, ///< Promote scalar->vector VEC2PREFSLOT, ///< Extract element 0 - MPY, ///< 16-bit Multiply (low parts of a 32-bit) - MPYU, ///< Multiply Unsigned - MPYH, ///< Multiply High - MPYHH, ///< Multiply High-High SHLQUAD_L_BITS, ///< Rotate quad left, by bits SHLQUAD_L_BYTES, ///< Rotate quad left, by bytes VEC_SHL, ///< Vector shift left @@ -52,8 +48,6 @@ namespace llvm { VEC_SRA, ///< Vector shift right (arithmetic) VEC_ROTL, ///< Vector rotate left VEC_ROTR, ///< Vector rotate right - ROTQUAD_RZ_BYTES, ///< Rotate quad right, by bytes, zero fill - ROTQUAD_RZ_BITS, ///< Rotate quad right, by bits, zero fill ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI) ROTBYTES_LEFT_BITS, ///< Rotate bytes left by bit shift count SELECT_MASK, ///< Select Mask (FSM, FSMB, FSMH, FSMBI) @@ -63,8 +57,6 @@ namespace llvm { CARRY_GENERATE, ///< Carry generate for ADD_EXTENDED SUB_EXTENDED, ///< Subtract extended, with borrow BORROW_GENERATE, ///< Borrow generate for SUB_EXTENDED - FPInterp, ///< Floating point interpolate - FPRecipEst, ///< Floating point reciprocal estimate SEXT32TO64, ///< Sign-extended 32-bit const -> 64-bits LAST_SPUISD ///< Last user-defined instruction }; @@ -87,7 +79,7 @@ namespace llvm { } class SPUTargetMachine; // forward dec'l. - + class SPUTargetLowering : public TargetLowering { @@ -97,14 +89,14 @@ namespace llvm { public: SPUTargetLowering(SPUTargetMachine &TM); - + /// getTargetNodeName() - This method returns the name of a target specific /// DAG node. 
virtual const char *getTargetNodeName(unsigned Opcode) const; /// getSetCCResultType - Return the ValueType for ISD::SETCC virtual MVT getSetCCResultType(const SDValue &) const; - + //! Custom lowering hooks virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG); @@ -116,7 +108,7 @@ namespace llvm { virtual void computeMaskedBitsForTargetNode(const SDValue Op, const APInt &Mask, - APInt &KnownZero, + APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const; @@ -126,12 +118,12 @@ namespace llvm { ConstraintType getConstraintType(const std::string &ConstraintLetter) const; - std::pair + std::pair getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, + bool hasMemory, std::vector &Ops, SelectionDAG &DAG) const; diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 37a58705795..3c8165fbbd7 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -82,7 +82,7 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI, case SPU::ORIi8i32: case SPU::AHIvec: case SPU::AHIr16: - case SPU::AIvec: + case SPU::AIv4i32: assert(MI.getNumOperands() == 3 && MI.getOperand(0).isReg() && MI.getOperand(1).isReg() && @@ -98,8 +98,7 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI, assert(MI.getNumOperands() == 3 && "wrong number of operands to AIr32"); if (MI.getOperand(0).isReg() && - (MI.getOperand(1).isReg() || - MI.getOperand(1).isFI()) && + MI.getOperand(1).isReg() && (MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0)) { sourceReg = MI.getOperand(1).getReg(); @@ -265,7 +264,7 @@ bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB, // reg class to any other reg class containing R3. This is required because // we instruction select bitconvert i64 -> f64 as a noop for example, so our // types have no specific meaning. 
- + if (DestRC == SPU::R8CRegisterClass) { BuildMI(MBB, MI, get(SPU::ORBIr8), DestReg).addReg(SrcReg).addImm(0); } else if (DestRC == SPU::R16CRegisterClass) { @@ -291,7 +290,7 @@ bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB, // Attempt to copy unknown/unsupported register class! return false; } - + return true; } @@ -464,7 +463,7 @@ SPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned OpNum = Ops[0]; unsigned Opc = MI->getOpcode(); MachineInstr *NewMI = 0; - + if ((Opc == SPU::ORr32 || Opc == SPU::ORv4i32) && MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { @@ -508,7 +507,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, // Get the last instruction in the block. MachineInstr *LastInst = I; - + // If there is only one terminator instruction, process it. if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { if (isUncondBranch(LastInst)) { @@ -524,7 +523,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, // Otherwise, don't know what this is. return true; } - + // Get the instruction before it if it's a terminator. MachineInstr *SecondLastInst = I; @@ -532,7 +531,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) return true; - + // If the block ends with a conditional and unconditional branch, handle it. if (isCondBranch(SecondLastInst) && isUncondBranch(LastInst)) { TBB = SecondLastInst->getOperand(1).getMBB(); @@ -541,7 +540,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, FBB = LastInst->getOperand(0).getMBB(); return false; } - + // If the block ends with two unconditional branches, handle it. The second // one is not executed, so remove it. 
if (isUncondBranch(SecondLastInst) && isUncondBranch(LastInst)) { @@ -554,7 +553,7 @@ SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, // Otherwise, can't handle this. return true; } - + unsigned SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); @@ -578,16 +577,16 @@ SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { I->eraseFromParent(); return 2; } - + unsigned SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl &Cond) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - assert((Cond.size() == 2 || Cond.size() == 0) && + assert((Cond.size() == 2 || Cond.size() == 0) && "SPU branch conditions have two components!"); - + // One-way branch. if (FBB == 0) { if (Cond.empty()) // Unconditional branch @@ -600,7 +599,7 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, } return 1; } - + // Two-way Conditional Branch. 
#if 0 BuildMI(&MBB, get(SPU::BRNZ)) diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index 1abbc0a5c04..751f36e6972 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -583,7 +583,9 @@ def AHIvec: def AHIr16: RI10Form<0b10111000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val), "ahi\t$rT, $rA, $val", IntegerOp, - [(set R16C:$rT, (add R16C:$rA, v8i16SExt10Imm:$val))]>; + [(set R16C:$rT, (add R16C:$rA, i16ImmSExt10:$val))]>; + +// v4i32, i32 add instruction: class AInst pattern>: RRForm<0b00000011000, OOL, IOL, @@ -604,21 +606,42 @@ multiclass AddInstruction { def v16i8: AVecInst; def r32: ARegInst; - def r8: AInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB), [/* no pattern */]>; } defm A : AddInstruction; -def AIvec: - RI10Form<0b00111000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), - "ai\t$rT, $rA, $val", IntegerOp, - [(set (v4i32 VECREG:$rT), (add (v4i32 VECREG:$rA), - v4i32SExt10Imm:$val))]>; +class AIInst pattern>: + RI10Form<0b00111000, OOL, IOL, + "ai\t$rT, $rA, $val", IntegerOp, + pattern>; -def AIr32: - RI10Form<0b00111000, (outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val), - "ai\t$rT, $rA, $val", IntegerOp, - [(set R32C:$rT, (add R32C:$rA, i32ImmSExt10:$val))]>; +class AIVecInst: + AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (vectype VECREG:$rT), (add (vectype VECREG:$rA), immpred:$val))]>; + +class AIFPVecInst: + AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [/* no pattern */]>; + +class AIRegInst: + AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val), + [(set rclass:$rT, (add rclass:$rA, immpred:$val))]>; + +// This is used to add epsilons to floating point numbers in the f32 fdiv code: +class AIFPInst: + AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val), + [/* no pattern */]>; + +multiclass AddImmediate { + def v4i32: AIVecInst; + + def r32: AIRegInst; + + def v4f32: AIFPVecInst; + def f32: AIFPInst; +} + +defm AI : AddImmediate; 
def SFHvec: RRForm<0b00010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), @@ -795,8 +818,7 @@ def BGXvec: def MPYv8i16: RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), "mpy\t$rT, $rA, $rB", IntegerMulDiv, - [(set (v8i16 VECREG:$rT), (SPUmpy_vec (v8i16 VECREG:$rA), - (v8i16 VECREG:$rB)))]>; + [/* no pattern */]>; def MPYr16: RRForm<0b00100011110, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB), @@ -812,8 +834,7 @@ class MPYUInst pattern>: def MPYUv4i32: MPYUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - [(set (v4i32 VECREG:$rT), - (SPUmpyu_vec (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + [/* no pattern */]>; def MPYUr16: MPYUInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB), @@ -821,7 +842,7 @@ def MPYUr16: def MPYUr32: MPYUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), - [(set R32C:$rT, (SPUmpyu_int R32C:$rA, R32C:$rB))]>; + [/* no pattern */]>; // mpyi: multiply 16 x s10imm -> 32 result. @@ -892,87 +913,78 @@ class MPYHInst pattern>: def MPYHv4i32: MPYHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - [(set (v4i32 VECREG:$rT), - (SPUmpyh_vec (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>; + [/* no pattern */]>; def MPYHr32: MPYHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB), - [(set R32C:$rT, (SPUmpyh_int R32C:$rA, R32C:$rB))]>; + [/* no pattern */]>; // mpys: multiply high and shift right (returns the top half of // a 16-bit multiply, sign extended to 32 bits.) 
-def MPYSvec: - RRForm<0b11100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - "mpys\t$rT, $rA, $rB", IntegerMulDiv, - []>; -def MPYSr16: - RRForm<0b11100011110, (outs R32C:$rT), (ins R16C:$rA, R16C:$rB), +class MPYSInst: + RRForm<0b11100011110, OOL, IOL, "mpys\t$rT, $rA, $rB", IntegerMulDiv, - []>; + [/* no pattern */]>; + +def MPYSvec: + MPYSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + +def MPYSr16: + MPYSInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB)>; // mpyhh: multiply high-high (returns the 32-bit result from multiplying // the top 16 bits of the $rA, $rB) + +class MPYHHInst: + RRForm<0b01100011110, OOL, IOL, + "mpyhh\t$rT, $rA, $rB", IntegerMulDiv, + [/* no pattern */]>; + def MPYHHv8i16: - RRForm<0b01100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - "mpyhh\t$rT, $rA, $rB", IntegerMulDiv, - [(set (v8i16 VECREG:$rT), - (SPUmpyhh_vec (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>; + MPYHHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; def MPYHHr32: - RRForm<0b01100011110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), - "mpyhh\t$rT, $rA, $rB", IntegerMulDiv, - []>; + MPYHHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>; // mpyhha: Multiply high-high, add to $rT: -def MPYHHAvec: - RRForm<0b01100010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - "mpyhha\t$rT, $rA, $rB", IntegerMulDiv, - []>; -def MPYHHAr32: - RRForm<0b01100010110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), +class MPYHHAInst: + RRForm<0b01100010110, OOL, IOL, "mpyhha\t$rT, $rA, $rB", IntegerMulDiv, - []>; + [/* no pattern */]>; + +def MPYHHAvec: + MPYHHAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + +def MPYHHAr32: + MPYHHAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>; // mpyhhu: Multiply high-high, unsigned -def MPYHHUvec: - RRForm<0b01110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - "mpyhhu\t$rT, $rA, $rB", IntegerMulDiv, - []>; -def MPYHHUr32: - RRForm<0b01110011110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), +class MPYHHUInst: 
+ RRForm<0b01110011110, OOL, IOL, "mpyhhu\t$rT, $rA, $rB", IntegerMulDiv, - []>; + [/* no pattern */]>; + +def MPYHHUvec: + MPYHHUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + +def MPYHHUr32: + MPYHHUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>; // mpyhhau: Multiply high-high, unsigned + +class MPYHHAUInst: + RRForm<0b01110010110, OOL, IOL, + "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv, + [/* no pattern */]>; + def MPYHHAUvec: - RRForm<0b01110010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv, - []>; - + MPYHHAUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>; + def MPYHHAUr32: - RRForm<0b01110010110, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB), - "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv, - []>; - -//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ -// v4i32, i32 multiply instruction sequence: -//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ -def MPYv4i32: - Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), - (Av4i32 - (Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB), - (MPYHv4i32 VECREG:$rB, VECREG:$rA)), - (MPYUv4i32 VECREG:$rA, VECREG:$rB))>; - -def MPYi32: - Pat<(mul R32C:$rA, R32C:$rB), - (Ar32 - (Ar32 (MPYHr32 R32C:$rA, R32C:$rB), - (MPYHr32 R32C:$rB, R32C:$rA)), - (MPYUr32 R32C:$rA, R32C:$rB))>; + MPYHHAUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>; //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // clz: Count leading zeroes @@ -983,7 +995,7 @@ class CLZInst pattern>: class CLZRegInst: CLZInst<(outs rclass:$rT), (ins rclass:$rA), - [(set rclass:$rT, (ctlz rclass:$rA))]>; + [(set rclass:$rT, (ctlz rclass:$rA))]>; class CLZVecInst: CLZInst<(outs VECREG:$rT), (ins VECREG:$rA), @@ -1424,7 +1436,7 @@ multiclass BitwiseOr def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB), [/* no pattern */]>; - // scalar->vector promotion: + // scalar->vector promotion, prefslot2vec: def v16i8_i8: ORPromoteScalar; def v8i16_i16: 
ORPromoteScalar; def v4i32_i32: ORPromoteScalar; @@ -1432,7 +1444,7 @@ multiclass BitwiseOr def v4f32_f32: ORPromoteScalar; def v2f64_f64: ORPromoteScalar; - // extract element 0: + // vector->scalar demotion, vec2prefslot: def i8_v16i8: ORExtractElt; def i16_v8i16: ORExtractElt; def i32_v4i32: ORExtractElt; @@ -1831,6 +1843,13 @@ class SELBVecInst: (and (vnot (vectype VECREG:$rC)), (vectype VECREG:$rA))))]>; +class SELBVecVCondInst: + SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + [(set (vectype VECREG:$rT), + (select (vectype VECREG:$rC), + (vectype VECREG:$rB), + (vectype VECREG:$rA)))]>; + class SELBVecCondInst: SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, R32C:$rC), [(set (vectype VECREG:$rT), @@ -1867,8 +1886,21 @@ multiclass SelectBits def v4i32_cond: SELBVecCondInst; def v2i64_cond: SELBVecCondInst; + def v16i8_vcond: SELBVecCondInst; + def v8i16_vcond: SELBVecCondInst; + def v4i32_vcond: SELBVecCondInst; + def v2i64_vcond: SELBVecCondInst; + + def v4f32_cond: + SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), + [(set (v4f32 VECREG:$rT), + (select (v4i32 VECREG:$rC), + (v4f32 VECREG:$rB), + (v4f32 VECREG:$rA)))]>; + // SELBr64_cond is defined further down, look for i64 comparisons def r32_cond: SELBRegCondInst; + def f32_cond: SELBRegCondInst; def r16_cond: SELBRegCondInst; def r8_cond: SELBRegCondInst; } @@ -2454,11 +2486,11 @@ class ROTQBIInst pattern>: RotateShift, pattern>; class ROTQBIVecInst: - ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), + ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), [/* no pattern yet */]>; class ROTQBIRegInst: - ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), + ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB), [/* no pattern yet */]>; multiclass RotateQuadByBitCount @@ -2645,9 +2677,6 @@ def : Pat<(srl R32C:$rA, (i8 imm:$val)), // ROTQMBYvec: This is a vector form merely so that when used in an // instruction pattern, type 
checking will succeed. This instruction assumes // that the user knew to negate $rB. -// -// Using the SPUrotquad_rz_bytes target-specific DAG node, the patterns -// ensure that $rB is negated. //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ class ROTQMBYInst pattern>: @@ -2660,8 +2689,7 @@ class ROTQMBYVecInst: class ROTQMBYRegInst: ROTQMBYInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB), - [(set rclass:$rT, - (SPUrotquad_rz_bytes rclass:$rA, R32C:$rB))]>; + [/* no pattern */]>; multiclass RotateQuadBytes { @@ -2676,32 +2704,17 @@ multiclass RotateQuadBytes defm ROTQMBY : RotateQuadBytes; -def : Pat<(SPUrotquad_rz_bytes (v16i8 VECREG:$rA), R32C:$rB), - (ROTQMBYv16i8 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bytes (v8i16 VECREG:$rA), R32C:$rB), - (ROTQMBYv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bytes (v4i32 VECREG:$rA), R32C:$rB), - (ROTQMBYv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bytes (v2i64 VECREG:$rA), R32C:$rB), - (ROTQMBYv2i64 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bytes GPRC:$rA, R32C:$rB), - (ROTQMBYr128 GPRC:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bytes R64C:$rA, R32C:$rB), - (ROTQMBYr64 R64C:$rA, (SFIr32 R32C:$rB, 0))>; - class ROTQMBYIInst pattern>: RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val", RotateShift, pattern>; class ROTQMBYIVecInst: ROTQMBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val), - [(set (vectype VECREG:$rT), - (SPUrotquad_rz_bytes (vectype VECREG:$rA), (i32 uimm7:$val)))]>; + [/* no pattern */]>; class ROTQMBYIRegInst: ROTQMBYIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val), - [(set rclass:$rT, - (SPUrotquad_rz_bytes rclass:$rA, (inttype pred:$val)))]>; + [/* no pattern */]>; multiclass RotateQuadBytesImm { @@ -2725,8 +2738,8 @@ class ROTQMBYBIInst pattern>: RotateShift, pattern>; class ROTQMBYBIVecInst: - ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - 
[/* no pattern, intrinsic? */]>; + ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), + [/* no pattern */]>; multiclass RotateMaskQuadByBitCount { @@ -2768,19 +2781,6 @@ multiclass RotateMaskQuadByBits defm ROTQMBI: RotateMaskQuadByBits; -def : Pat<(SPUrotquad_rz_bits (v16i8 VECREG:$rA), R32C:$rB), - (ROTQMBIv16i8 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bits (v8i16 VECREG:$rA), R32C:$rB), - (ROTQMBIv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bits (v4i32 VECREG:$rA), R32C:$rB), - (ROTQMBIv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bits (v2i64 VECREG:$rA), R32C:$rB), - (ROTQMBIv2i64 VECREG:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bits GPRC:$rA, R32C:$rB), - (ROTQMBIr128 GPRC:$rA, (SFIr32 R32C:$rB, 0))>; -def : Pat<(SPUrotquad_rz_bits R64C:$rA, R32C:$rB), - (ROTQMBIr64 R64C:$rA, (SFIr32 R32C:$rB, 0))>; - //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // Rotate quad and mask by bits, immediate //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ @@ -2791,13 +2791,11 @@ class ROTQMBIIInst pattern>: class ROTQMBIIVecInst: ROTQMBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val), - [(set (vectype VECREG:$rT), - (SPUrotquad_rz_bits (vectype VECREG:$rA), (i32 uimm7:$val)))]>; + [/* no pattern */]>; class ROTQMBIIRegInst: ROTQMBIIInst<(outs rclass:$rT), (ins rclass:$rA, rotNeg7imm:$val), - [(set rclass:$rT, - (SPUrotquad_rz_bits rclass:$rA, (i32 uimm7:$val)))]>; + [/* no pattern */]>; multiclass RotateMaskQuadByBitsImm { @@ -3142,6 +3140,15 @@ multiclass CmpGtrWordImm def r32: CGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val), [(set R32C:$rT, (setgt R32C:$rA, i32ImmSExt10:$val))]>; + + // CGTIv4f32, CGTIf32: These are used in the f32 fdiv instruction sequence: + def v4f32: CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val), + [(set (v4i32 VECREG:$rT), + (setgt (v4i32 (bitconvert (v4f32 VECREG:$rA))), + (v4i32 
v4i32SExt16Imm:$val)))]>; + + def f32: CGTIInst<(outs R32C:$rT), (ins R32FP:$rA, s10imm_i32:$val), + [/* no pattern */]>; } class CLGTBInst pattern> : @@ -3750,62 +3757,63 @@ let isTerminator = 1, isBarrier = 1 in { class FAInst pattern>: RRForm<0b01011000100, OOL, IOL, "fa\t$rT, $rA, $rB", - SPrecFP, pattern>; + SPrecFP, pattern>; class FAVecInst: FAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), [(set (vectype VECREG:$rT), - (fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>; + (fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>; multiclass SFPAdd { def v4f32: FAVecInst; - def r32: FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), - [(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>; + def f32: FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), + [(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>; } defm FA : SFPAdd; class FSInst pattern>: RRForm<0b01011000100, OOL, IOL, "fs\t$rT, $rA, $rB", - SPrecFP, pattern>; + SPrecFP, pattern>; class FSVecInst: FSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - [(set (vectype VECREG:$rT), - (fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>; + [(set (vectype VECREG:$rT), + (fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>; multiclass SFPSub { def v4f32: FSVecInst; - def r32: FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), - [(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>; + def f32: FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), + [(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>; } defm FS : SFPSub; // Floating point reciprocal estimate -def FREv4f32 : - RRForm_1<0b00011101100, (outs VECREG:$rT), (ins VECREG:$rA), - "frest\t$rT, $rA", SPrecFP, - [(set (v4f32 VECREG:$rT), (SPUreciprocalEst (v4f32 VECREG:$rA)))]>; -def FREf32 : - RRForm_1<0b00011101100, (outs R32FP:$rT), (ins R32FP:$rA), - "frest\t$rT, $rA", SPrecFP, - [(set R32FP:$rT, (SPUreciprocalEst R32FP:$rA))]>; +class FRESTInst: + RRForm_1<0b00110111000, OOL, IOL, + "frest\t$rT, $rA", SPrecFP, + [/* no pattern */]>; + +def FRESTv4f32 : + 
FRESTInst<(outs VECREG:$rT), (ins VECREG:$rA)>; + +def FRESTf32 : + FRESTInst<(outs R32FP:$rT), (ins R32FP:$rA)>; // Floating point interpolate (used in conjunction with reciprocal estimate) def FIv4f32 : RRForm<0b00101011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), "fi\t$rT, $rA, $rB", SPrecFP, - [(set (v4f32 VECREG:$rT), (SPUinterpolate (v4f32 VECREG:$rA), - (v4f32 VECREG:$rB)))]>; + [/* no pattern */]>; def FIf32 : RRForm<0b00101011110, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB), "fi\t$rT, $rA, $rB", SPrecFP, - [(set R32FP:$rT, (SPUinterpolate R32FP:$rA, R32FP:$rB))]>; + [/* no pattern */]>; //-------------------------------------------------------------------------- // Basic single precision floating point comparisons: @@ -4445,12 +4453,14 @@ def : Pat<(SPUindirect (SPUhi tconstpool:$in, 0), (SPUlo tconstpool:$in, 0)), (IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>; +/* def : Pat<(SPUindirect R32C:$sp, i32ImmSExt10:$imm), (AIr32 R32C:$sp, i32ImmSExt10:$imm)>; def : Pat<(SPUindirect R32C:$sp, imm:$imm), (Ar32 R32C:$sp, (IOHLr32 (ILHUr32 (HI16 imm:$imm)), (LO16 imm:$imm)))>; + */ def : Pat<(add (SPUhi tglobaladdr:$in, 0), (SPUlo tglobaladdr:$in, 0)), (IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>; @@ -4466,5 +4476,7 @@ def : Pat<(add (SPUhi tconstpool:$in, 0), (SPUlo tconstpool:$in, 0)), // Instrinsics: include "CellSDKIntrinsics.td" +// Various math operator instruction sequences +include "SPUMathInstr.td" // 64-bit "instructions"/support include "SPU64InstrInfo.td" diff --git a/lib/Target/CellSPU/SPUMathInstr.td b/lib/Target/CellSPU/SPUMathInstr.td new file mode 100644 index 00000000000..38279a0a9f8 --- /dev/null +++ b/lib/Target/CellSPU/SPUMathInstr.td @@ -0,0 +1,99 @@ +//======--- SPUMathInstr.td - Cell SPU math operations -*- tablegen -*---=====// +// +// Cell SPU math operations +// +// This target description file contains instruction sequences for various +// math operations, such as vector multiplies, i32 multiply, etc., for the +// 
SPU's i32, i16, i8 and corresponding vector types. +// +// Any resemblance to libsimdmath or the Cell SDK simdmath library is +// purely and completely coincidental. +// +// Primary author: Scott Michel (scottm@aero.org) +//===----------------------------------------------------------------------===// + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v16i8 multiply instruction sequence: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def : Pat<(mul (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)), + (ORv4i32 + (ANDv4i32 + (SELBv4i32 (MPYv8i16 VECREG:$rA, VECREG:$rB), + (SHLHIv8i16 (MPYv8i16 (ROTMAHIv8i16 VECREG:$rA, 8), + (ROTMAHIv8i16 VECREG:$rB, 8)), 8), + (FSMBIv8i16 0x2222)), + (ILAv4i32 0x0000ffff)), + (SHLIv4i32 + (SELBv4i32 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 16), + (ROTMAIv4i32_i32 VECREG:$rB, 16)), + (SHLHIv8i16 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 8), + (ROTMAIv4i32_i32 VECREG:$rB, 8)), 8), + (FSMBIv8i16 0x2222)), 16))>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v8i16 multiply instruction sequence: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def : Pat<(mul (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)), + (SELBv8i16 (MPYv8i16 VECREG:$rA, VECREG:$rB), + (SHLIv4i32 (MPYHHv8i16 VECREG:$rA, VECREG:$rB), 16), + (FSMBIv8i16 0xcccc))>; + +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// v4i32, i32 multiply instruction sequence: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +def MPYv4i32: + Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)), + (Av4i32 + (Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB), + (MPYHv4i32 VECREG:$rB, VECREG:$rA)), + (MPYUv4i32 VECREG:$rA, VECREG:$rB))>; + +def MPYi32: + Pat<(mul R32C:$rA, R32C:$rB), + (Ar32 + (Ar32 (MPYHr32 R32C:$rA, R32C:$rB), + (MPYHr32 R32C:$rB, R32C:$rA)), + (MPYUr32 R32C:$rA, R32C:$rB))>; + 
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// f32, v4f32 divide instruction sequence: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ + +// Reciprocal estimate and interpolation +def Interpf32: CodeFrag<(FIf32 R32FP:$rB, (FRESTf32 R32FP:$rB))>; +// Division estimate +def DivEstf32: CodeFrag<(FMf32 R32FP:$rA, Interpf32.Fragment)>; +// Newton-Raphson iteration +def NRaphf32: CodeFrag<(FMAf32 (FNMSf32 DivEstf32.Fragment, R32FP:$rB, R32FP:$rA), + Interpf32.Fragment, + DivEstf32.Fragment)>; +// Epsilon addition +def Epsilonf32: CodeFrag<(AIf32 NRaphf32.Fragment, 1)>; + +def : Pat<(fdiv R32FP:$rA, R32FP:$rB), + (SELBf32_cond NRaphf32.Fragment, + Epsilonf32.Fragment, + (CGTIf32 (FNMSf32 R32FP:$rB, Epsilonf32.Fragment, R32FP:$rA), -1))>; + +// Reciprocal estimate and interpolation +def Interpv4f32: CodeFrag<(FIv4f32 (v4f32 VECREG:$rB), (FRESTv4f32 (v4f32 VECREG:$rB)))>; +// Division estimate +def DivEstv4f32: CodeFrag<(FMv4f32 (v4f32 VECREG:$rA), Interpv4f32.Fragment)>; +// Newton-Raphson iteration +def NRaphv4f32: CodeFrag<(FMAv4f32 (FNMSv4f32 DivEstv4f32.Fragment, + (v4f32 VECREG:$rB), + (v4f32 VECREG:$rA)), + Interpv4f32.Fragment, + DivEstv4f32.Fragment)>; +// Epsilon addition +def Epsilonv4f32: CodeFrag<(AIv4f32 NRaphv4f32.Fragment, 1)>; + +def : Pat<(fdiv (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)), + (SELBv4f32_cond NRaphv4f32.Fragment, + Epsilonv4f32.Fragment, + (CGTIv4f32 (FNMSv4f32 (v4f32 VECREG:$rB), + Epsilonv4f32.Fragment, + (v4f32 VECREG:$rA)), -1))>; diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td index 5cf229e4b78..89a52eedb18 100644 --- a/lib/Target/CellSPU/SPUNodes.td +++ b/lib/Target/CellSPU/SPUNodes.td @@ -87,24 +87,6 @@ def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>; // SPUISelLowering.h): def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>; -// SPU 16-bit multiply -def SPUmpy_vec: SDNode<"SPUISD::MPY", SPUVecBinop, []>; - -// SPU multiply unsigned, used in 
instruction lowering for v4i32 -// multiplies: -def SPUmpyu_vec: SDNode<"SPUISD::MPYU", SPUVecBinop, []>; -def SPUmpyu_int: SDNode<"SPUISD::MPYU", SDTIntBinOp, []>; - -// SPU 16-bit multiply high x low, shift result 16-bits -// Used to compute intermediate products for 32-bit multiplies -def SPUmpyh_vec: SDNode<"SPUISD::MPYH", SPUVecBinop, []>; -def SPUmpyh_int: SDNode<"SPUISD::MPYH", SDTIntBinOp, []>; - -// SPU 16-bit multiply high x high, 32-bit product -// Used to compute intermediate products for 16-bit multiplies -def SPUmpyhh_vec: SDNode<"SPUISD::MPYHH", SPUVecBinop, []>; -def SPUmpyhh_int: SDNode<"SPUISD::MPYHH", SDTIntBinOp, []>; - // Shift left quadword by bits and bytes def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>; def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>; @@ -117,11 +99,6 @@ def SPUvec_sra: SDNode<"SPUISD::VEC_SRA", SPUvecshift_type, []>; def SPUvec_rotl: SDNode<"SPUISD::VEC_ROTL", SPUvecshift_type, []>; def SPUvec_rotr: SDNode<"SPUISD::VEC_ROTR", SPUvecshift_type, []>; -def SPUrotquad_rz_bytes: SDNode<"SPUISD::ROTQUAD_RZ_BYTES", - SPUvecshift_type, []>; -def SPUrotquad_rz_bits: SDNode<"SPUISD::ROTQUAD_RZ_BITS", - SPUvecshift_type, []>; - // Vector rotate left, bits shifted out of the left are rotated in on the right def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT", SPUvecshift_type, []>; @@ -141,12 +118,6 @@ def SPUselb: SDNode<"SPUISD::SELB", SPUselb_type, []>; // SPU gather bits instruction: def SPUgatherbits: SDNode<"SPUISD::GATHER_BITS", SPUgatherbits_type, []>; -// SPU floating point interpolate -def SPUinterpolate : SDNode<"SPUISD::FPInterp", SDTFPBinOp, []>; - -// SPU floating point reciprocal estimate (used for fdiv) -def SPUreciprocalEst: SDNode<"SPUISD::FPRecipEst", SDTFPUnaryOp, []>; - def SDTprefslot2vec: SDTypeProfile<1, 1, []>; def SPUprefslot2vec: SDNode<"SPUISD::PREFSLOT2VEC", SDTprefslot2vec, []>; diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp 
b/lib/Target/CellSPU/SPURegisterInfo.cpp index cf4089fa29e..381522dac54 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -238,7 +238,7 @@ SPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const SPU::R0, /* link register */ 0 /* end */ }; - + return SPU_CalleeSaveRegs; } @@ -268,7 +268,7 @@ SPURegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const &SPU::GPRCRegClass, /* link register */ 0 /* end */ }; - + return SPU_CalleeSaveRegClasses; } @@ -339,10 +339,13 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, // Now add the frame object offset to the offset from r1. int Offset = MFI->getObjectOffset(FrameIndex); - // Most instructions, except for generated FrameIndex additions using AIr32, - // have the immediate in operand 1. AIr32, in this case, has the immediate - // in operand 2. - unsigned OpNo = (MI.getOpcode() != SPU::AIr32 ? 1 : 2); + // Most instructions, except for generated FrameIndex additions using AIr32 + // and ILAr32, have the immediate in operand 1. AIr32 and ILAr32 have the + // immediate in operand 2. + unsigned OpNo = 1; + if (MI.getOpcode() == SPU::AIr32 || MI.getOpcode() == SPU::ILAr32) + OpNo = 2; + MachineOperand &MO = MI.getOperand(OpNo); // Offset is biased by $lr's slot at the bottom. @@ -355,7 +358,7 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, if (Offset > SPUFrameInfo::maxFrameOffset() || Offset < SPUFrameInfo::minFrameOffset()) { cerr << "Large stack adjustment (" - << Offset + << Offset << ") in SPURegisterInfo::eliminateFrameIndex."; } else { MO.ChangeToImmediate(Offset); @@ -371,7 +374,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const // Get the number of bytes to allocate from the FrameInfo unsigned FrameSize = MFI->getStackSize(); - + // Get the alignments provided by the target, and the maximum alignment // (if any) of the fixed frame objects. 
unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); @@ -381,7 +384,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const // Get the maximum call frame size of all the calls. unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); - + // If we have dynamic alloca then maxCallFrameSize needs to be aligned so // that allocations will be aligned. if (MFI->hasVarSizedObjects()) @@ -389,7 +392,7 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const // Update maximum call frame size. MFI->setMaxCallFrameSize(maxCallFrameSize); - + // Include call frame size in total. FrameSize += maxCallFrameSize; @@ -418,18 +421,18 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo *MMI = MFI->getMachineModuleInfo(); - + // Prepare for debug frame info. bool hasDebugInfo = MMI && MMI->hasDebugInfo(); unsigned FrameLabelId = 0; - + // Move MBBI back to the beginning of the function. MBBI = MBB.begin(); - + // Work out frame sizes. determineFrameLayout(MF); int FrameSize = MFI->getStackSize(); - + assert((FrameSize & 0xf) == 0 && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); @@ -440,7 +443,7 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const FrameLabelId = MMI->NextLabelID(); BuildMI(MBB, MBBI, TII.get(SPU::DBG_LABEL)).addImm(FrameLabelId); } - + // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp) // for the ABI BuildMI(MBB, MBBI, TII.get(SPU::STQDr32), SPU::R0).addImm(16) @@ -476,15 +479,15 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const cerr << "Unhandled frame size: " << FrameSize << "\n"; abort(); } - + if (hasDebugInfo) { std::vector &Moves = MMI->getFrameMoves(); - + // Show update of SP. 
MachineLocation SPDst(MachineLocation::VirtualFP); MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize); Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc)); - + // Add callee saved registers to move list. const std::vector &CSI = MFI->getCalleeSavedInfo(); for (unsigned I = 0, E = CSI.size(); I != E; ++I) { @@ -495,11 +498,11 @@ void SPURegisterInfo::emitPrologue(MachineFunction &MF) const MachineLocation CSSrc(Reg); Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc)); } - + // Mark effective beginning of when frame pointer is ready. unsigned ReadyLabelId = MMI->NextLabelID(); BuildMI(MBB, MBBI, TII.get(SPU::DBG_LABEL)).addImm(ReadyLabelId); - + MachineLocation FPDst(SPU::R1); MachineLocation FPSrc(MachineLocation::VirtualFP); Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc)); diff --git a/test/CodeGen/CellSPU/fdiv.ll b/test/CodeGen/CellSPU/fdiv.ll index 826a2faaabf..d121c3f8c90 100644 --- a/test/CodeGen/CellSPU/fdiv.ll +++ b/test/CodeGen/CellSPU/fdiv.ll @@ -1,9 +1,11 @@ ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s ; RUN: grep frest %t1.s | count 2 ; RUN: grep -w fi %t1.s | count 2 -; RUN: grep fm %t1.s | count 4 +; RUN: grep -w fm %t1.s | count 2 ; RUN: grep fma %t1.s | count 2 -; RUN: grep fnms %t1.s | count 2 +; RUN: grep fnms %t1.s | count 4 +; RUN: grep cgti %t1.s | count 2 +; RUN: grep selb %t1.s | count 2 ; ; This file includes standard floating point arithmetic instructions target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128" diff --git a/test/CodeGen/CellSPU/i64ops.ll b/test/CodeGen/CellSPU/i64ops.ll index 5e7897bc971..51abd44a09e 100644 --- a/test/CodeGen/CellSPU/i64ops.ll +++ b/test/CodeGen/CellSPU/i64ops.ll @@ -1,8 +1,5 @@ ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s -; RUN: grep {fsmbi.*61680} %t1.s | count 1 -; RUN: grep rotqmbyi %t1.s | count 1 -; RUN: grep rotmai %t1.s | count 1 -; RUN: grep selb %t1.s | count 1 +; RUN: 
grep xswd %t1.s | count 1 ; RUN: grep shufb %t1.s | count 2 ; RUN: grep cg %t1.s | count 1 ; RUN: grep addx %t1.s | count 1 diff --git a/test/CodeGen/CellSPU/mul_ops.ll b/test/CodeGen/CellSPU/mul_ops.ll index 843505f1359..085ce555dc2 100644 --- a/test/CodeGen/CellSPU/mul_ops.ll +++ b/test/CodeGen/CellSPU/mul_ops.ll @@ -8,7 +8,7 @@ ; RUN: grep and %t1.s | count 2 ; RUN: grep selb %t1.s | count 6 ; RUN: grep fsmbi %t1.s | count 4 -; RUN: grep shli %t1.s | count 2 +; RUN: grep shli %t1.s | count 4 ; RUN: grep shlhi %t1.s | count 4 ; RUN: grep ila %t1.s | count 2 ; RUN: grep xsbh %t1.s | count 4 diff --git a/test/CodeGen/CellSPU/shift_ops.ll b/test/CodeGen/CellSPU/shift_ops.ll index b6629cac2a1..5b60dc178fa 100644 --- a/test/CodeGen/CellSPU/shift_ops.ll +++ b/test/CodeGen/CellSPU/shift_ops.ll @@ -1,10 +1,21 @@ ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s -; RUN: grep shlh %t1.s | count 84 -; RUN: grep shlhi %t1.s | count 51 -; RUN: grep shl %t1.s | count 168 -; RUN: grep shli %t1.s | count 51 -; RUN: grep xshw %t1.s | count 5 -; RUN: grep and %t1.s | count 5 +; RUN: grep -w shlh %t1.s | count 9 +; RUN: grep -w shlhi %t1.s | count 3 +; RUN: grep -w shl %t1.s | count 9 +; RUN: grep -w shli %t1.s | count 3 +; RUN: grep -w xshw %t1.s | count 5 +; RUN: grep -w and %t1.s | count 5 +; RUN: grep -w andi %t1.s | count 2 +; RUN: grep -w rotmi %t1.s | count 2 +; RUN: grep -w rotqmbyi %t1.s | count 1 +; RUN: grep -w rotqmbii %t1.s | count 2 +; RUN: grep -w rotqmby %t1.s | count 1 +; RUN: grep -w rotqmbi %t1.s | count 1 +; RUN: grep -w rotqbyi %t1.s | count 1 +; RUN: grep -w rotqbii %t1.s | count 2 +; RUN: grep -w rotqbybi %t1.s | count 1 +; RUN: grep -w sfi %t1.s | count 3 + target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128" target triple = "spu" @@ -210,3 +221,57 @@ define i32 @shli_i32_12(i32 zeroext %arg1) zeroext { %A = shl i32 0, %arg1 ret i32 %A } + +;; i64 shift left + +define 
i64 @shl_i64_1(i64 %arg1) { + %A = shl i64 %arg1, 9 + ret i64 %A +} + +define i64 @shl_i64_2(i64 %arg1) { + %A = shl i64 %arg1, 3 + ret i64 %A +} + +define i64 @shl_i64_3(i64 %arg1, i32 %shift) { + %1 = zext i32 %shift to i64 + %2 = shl i64 %arg1, %1 + ret i64 %2 +} + +;; i64 shift right logical (shift 0s from the right) + +define i64 @lshr_i64_1(i64 %arg1) { + %1 = lshr i64 %arg1, 9 + ret i64 %1 +} + +define i64 @lshr_i64_2(i64 %arg1) { + %1 = lshr i64 %arg1, 3 + ret i64 %1 +} + +define i64 @lshr_i64_3(i64 %arg1, i32 %shift) { + %1 = zext i32 %shift to i64 + %2 = lshr i64 %arg1, %1 + ret i64 %2 +} + +;; i64 shift right arithmetic (shift 1s from the right) + +define i64 @ashr_i64_1(i64 %arg) { + %1 = ashr i64 %arg, 9 + ret i64 %1 +} + +define i64 @ashr_i64_2(i64 %arg) { + %1 = ashr i64 %arg, 3 + ret i64 %1 +} + +define i64 @ashr_i64_3(i64 %arg1, i32 %shift) { + %1 = zext i32 %shift to i64 + %2 = ashr i64 %arg1, %1 + ret i64 %2 +} diff --git a/test/CodeGen/CellSPU/useful-harnesses/i64operations.c b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c index 7b86070095f..3819797d148 100644 --- a/test/CodeGen/CellSPU/useful-harnesses/i64operations.c +++ b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c @@ -34,19 +34,45 @@ struct pred_s preds[] = { { "neq", i64_neq, i64_neq_select } }; +uint64_t i64_shl_const(uint64_t a) { + return a << 10; +} + +uint64_t i64_shl(uint64_t a, int amt) { + return a << amt; +} + +uint64_t i64_srl_const(uint64_t a) { + return a >> 10; +} + +uint64_t i64_srl(uint64_t a, int amt) { + return a >> amt; +} + +int64_t i64_sra_const(int64_t a) { + return a >> 10; +} + +int64_t i64_sra(int64_t a, int amt) { + return a >> amt; +} + int main(void) { int i; - int64_t a = 1234567890000LL; - int64_t b = 2345678901234LL; - int64_t c = 1234567890001LL; - int64_t d = 10001LL; - int64_t e = 10000LL; + int64_t a = 1234567890003LL; + int64_t b = 2345678901235LL; + int64_t c = 1234567890001LL; + int64_t d = 10001LL; + int64_t e = 10000LL; + int64_t f = 
-1068103409991LL; printf("a = %16lld (0x%016llx)\n", a, a); printf("b = %16lld (0x%016llx)\n", b, b); printf("c = %16lld (0x%016llx)\n", c, c); printf("d = %16lld (0x%016llx)\n", d, d); printf("e = %16lld (0x%016llx)\n", e, e); + printf("f = %16lld (0x%016llx)\n", f, f); printf("----------------------------------------\n"); for (i = 0; i < sizeof(preds)/sizeof(preds[0]); ++i) { @@ -64,5 +90,23 @@ int main(void) { printf("----------------------------------------\n"); } + printf("a = 0x%016llx\n", a); + printf("i64_shl_const(a) = 0x%016llx\n", i64_shl_const(a)); + printf("i64_shl(a) = 0x%016llx\n", i64_shl(a, 5)); + printf("i64_srl_const(a) = 0x%016llx\n", i64_srl_const(a)); + printf("i64_srl(a) = 0x%016llx\n", i64_srl(a, 5)); + printf("i64_sra_const(a) = 0x%016llx\n", i64_sra_const(a)); + printf("i64_sra(a) = 0x%016llx\n", i64_sra(a, 5)); + printf("----------------------------------------\n"); + + printf("f = 0x%016llx\n", f); + printf("i64_shl_const(f) = 0x%016llx\n", i64_shl_const(f)); + printf("i64_shl(f) = 0x%016llx\n", i64_shl(f, 10)); + printf("i64_srl_const(f) = 0x%016llx\n", i64_srl_const(f)); + printf("i64_srl(f) = 0x%016llx\n", i64_srl(f, 10)); + printf("i64_sra_const(f) = 0x%016llx\n", i64_sra_const(f)); + printf("i64_sra(f) = 0x%016llx\n", i64_sra(f, 10)); + printf("----------------------------------------\n"); + return 0; }