From 7a1c9e9cb7822afcf86b6575a3187974beecd9c6 Mon Sep 17 00:00:00 2001 From: Scott Michel Date: Sat, 22 Nov 2008 23:50:42 +0000 Subject: [PATCH] CellSPU: Fix bug 3056. Varadic extract_element was not implemented (nor was it ever conceived to occur). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@59891 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/CellSPU/SPUISelDAGToDAG.cpp | 19 +- lib/Target/CellSPU/SPUISelLowering.cpp | 332 ++++++++++++++++++------- lib/Target/CellSPU/SPUISelLowering.h | 2 +- lib/Target/CellSPU/SPUInstrInfo.td | 53 ++-- lib/Target/CellSPU/SPUNodes.td | 2 +- test/CodeGen/CellSPU/extract_elt.ll | 114 ++++++++- 6 files changed, 408 insertions(+), 114 deletions(-) diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 109cd5ee1ee..4fbd5bb467a 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -591,13 +591,24 @@ SPUDAGToDAGISel::SelectXFormAddr(SDValue Op, SDValue N, SDValue &Base, SDValue &Index) { if (!SelectAFormAddr(Op, N, Base, Index) && !SelectDFormAddr(Op, N, Base, Index)) { - // default form of a X-form address is r(r) in operands 0 and 1: + // Default form of a X-form address is r(r) in operands 0 and 1: SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); - if (Op0.getOpcode() == ISD::Register && Op1.getOpcode() == ISD::Register) { - Base = Op0; - Index = Op1; + if ((Op0.getOpcode() == ISD::Register + || Op.getOpcode() == ISD::CopyFromReg) + && (Op1.getOpcode() == ISD::Register + || Op.getOpcode() == ISD::CopyFromReg)) { + if (Op.getOpcode() == ISD::Register) + Base = Op0; + else + Base = Op0.getOperand(1); + + if (Op1.getOpcode() == ISD::Register) + Index = Op1; + else + Index = Op1.getOperand(1); + return true; } } diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 9f828b410b6..d44da756530 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -39,8 +39,8 @@ namespace { //! MVT mapping to useful data for Cell SPU struct valtype_map_s { - const MVT valtype; - const int prefslot_byte; + const MVT valtype; + const int prefslot_byte; }; const valtype_map_s valtype_map[] = { @@ -171,7 +171,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) // Expand the jumptable branches setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::Other, Expand); + + // Custom lower SELECT_CC for most cases, but expand by default setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i8, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i16, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); // SPU has no intrinsics for these particular operations: setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); @@ -398,6 +404,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setTargetDAGCombine(ISD::ANY_EXTEND); computeRegisterProperties(); + + // Set other properties: + setSchedulingPreference(SchedulingForLatency); } const char * @@ -413,7 +422,7 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT"; node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL"; node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB"; - node_names[(unsigned) SPUISD::INSERT_MASK] = "SPUISD::INSERT_MASK"; + node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK"; node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; node_names[(unsigned) SPUISD::PROMOTE_SCALAR] = "SPUISD::PROMOTE_SCALAR"; node_names[(unsigned) SPUISD::EXTRACT_ELT0] = "SPUISD::EXTRACT_ELT0"; @@ -750,7 +759,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { } SDValue insertEltOp = - DAG.getNode(SPUISD::INSERT_MASK, stVecVT, insertEltPtr); + DAG.getNode(SPUISD::SHUFFLE_MASK, stVecVT, insertEltPtr); SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, vecVT, theValue); @@ -1720,11 +1729,11 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { /// which the Cell can operate. The code inspects V3 to ascertain whether the /// permutation vector, V3, is monotonically increasing with one "exception" /// element, e.g., (0, 1, _, 3). If this is the case, then generate a -/// INSERT_MASK synthetic instruction. Otherwise, spill V3 to the constant pool. +/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool. /// In either case, the net result is going to eventually invoke SHUFB to /// permute/shuffle the bytes from V1 and V2. /// \note -/// INSERT_MASK is eventually selected as one of the C*D instructions, generate +/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate /// control word for byte/halfword/word insertion. This takes care of a single /// element move from V2 into V1. /// \note @@ -1782,9 +1791,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // Initialize temporary register to 0 SDValue InitTempReg = DAG.getCopyToReg(DAG.getEntryNode(), VReg, DAG.getConstant(0, PtrVT)); - // Copy register's contents as index in INSERT_MASK: + // Copy register's contents as index in SHUFFLE_MASK: SDValue ShufMaskOp = - DAG.getNode(SPUISD::INSERT_MASK, V1.getValueType(), + DAG.getNode(SPUISD::SHUFFLE_MASK, V1.getValueType(), DAG.getTargetConstant(V2Elt, MVT::i32), DAG.getCopyFromReg(InitTempReg, VReg, PtrVT)); // Use shuffle mask in SHUFB synthetic instruction: @@ -2050,82 +2059,200 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); SDValue N = Op.getOperand(0); SDValue Elt = Op.getOperand(1); - SDValue ShufMask[16]; - ConstantSDNode *C = dyn_cast(Elt); + SDValue retval; - assert(C != 0 && "LowerEXTRACT_VECTOR_ELT expecting constant SDNode"); + if (ConstantSDNode *C = dyn_cast(Elt)) { + // Constant argument: + int EltNo = (int) C->getZExtValue(); - int EltNo = (int) C->getZExtValue(); + // sanity checks: + if (VT == MVT::i8 && EltNo >= 16) + assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15"); + else if (VT == MVT::i16 && EltNo >= 8) + assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7"); + else if (VT == MVT::i32 && EltNo >= 4) + assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4"); + else if (VT == MVT::i64 && EltNo >= 2) + assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2"); - // sanity checks: - if (VT == MVT::i8 && EltNo >= 16) - assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15"); - else if (VT == MVT::i16 && EltNo >= 8) - assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7"); - else if (VT == MVT::i32 && EltNo >= 4) - assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4"); - else if (VT == MVT::i64 && EltNo >= 2) - assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2"); + if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) { + // i32 and i64: Element 0 is the preferred slot + return DAG.getNode(SPUISD::EXTRACT_ELT0, VT, N); + } - if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) { - // i32 and i64: Element 0 is the preferred slot - return DAG.getNode(SPUISD::EXTRACT_ELT0, VT, N); + // Need to generate shuffle mask and extract: + int prefslot_begin = -1, prefslot_end = -1; + int elt_byte = EltNo * VT.getSizeInBits() / 8; + + switch (VT.getSimpleVT()) { + default: + assert(false && "Invalid value type!"); + case MVT::i8: { + prefslot_begin = prefslot_end = 3; + break; + } + case MVT::i16: { + prefslot_begin = 2; prefslot_end = 3; + break; + } + case MVT::i32: + case MVT::f32: { + prefslot_begin = 0; prefslot_end = 3; + break; + } + case MVT::i64: + case MVT::f64: { + prefslot_begin = 0; prefslot_end = 7; + break; + } + } + + assert(prefslot_begin != -1 && prefslot_end != -1 && + "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized"); + + unsigned int ShufBytes[16]; + for (int i = 0; i < 16; ++i) { + // zero fill uppper part of preferred slot, don't care about the + // other slots: + unsigned int mask_val; + if (i <= prefslot_end) { + mask_val = + ((i < prefslot_begin) + ? 0x80 + : elt_byte + (i - prefslot_begin)); + + ShufBytes[i] = mask_val; + } else + ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)]; + } + + SDValue ShufMask[4]; + for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) { + unsigned bidx = i / 4; + unsigned int bits = ((ShufBytes[bidx] << 24) | + (ShufBytes[bidx+1] << 16) | + (ShufBytes[bidx+2] << 8) | + ShufBytes[bidx+3]); + ShufMask[i] = DAG.getConstant(bits, MVT::i32); + } + + SDValue ShufMaskVec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, + &ShufMask[0], + sizeof(ShufMask) / sizeof(ShufMask[0])); + + retval = DAG.getNode(SPUISD::EXTRACT_ELT0, VT, + DAG.getNode(SPUISD::SHUFB, N.getValueType(), + N, N, ShufMaskVec)); + } else { + // Variable index: Rotate the requested element into slot 0, then replicate + // slot 0 across the vector + MVT VecVT = N.getValueType(); + if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) { + cerr << "LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit vector type!\n"; + abort(); + } + + // Make life easier by making sure the index is zero-extended to i32 + if (Elt.getValueType() != MVT::i32) + Elt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Elt); + + // Scale the index to a bit/byte shift quantity + APInt scaleFactor = + APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false); + SDValue vecShift; + + switch (VT.getSimpleVT()) { + default: + cerr << "LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector type\n"; + abort(); + /*NOTREACHED*/ + case MVT::i8: { + // Don't need to scale, but we do need to correct for where bytes go in + // slot 0: + SDValue prefSlot = DAG.getNode(ISD::SUB, MVT::i32, + Elt, DAG.getConstant(3, MVT::i32)); + SDValue corrected = DAG.getNode(ISD::ADD, MVT::i32, prefSlot, + DAG.getConstant(16, MVT::i32)); + + SDValue shiftAmt = DAG.getNode(ISD::SELECT_CC, MVT::i32, + prefSlot, DAG.getConstant(0, MVT::i32), + prefSlot, // trueval + corrected, // falseval + DAG.getCondCode(ISD::SETGT)); + vecShift = DAG.getNode(SPUISD::ROTBYTES_LEFT, VecVT, N, shiftAmt); + break; + } + case MVT::i16: { + // Scale the index to bytes, subtract for preferred slot: + Elt = DAG.getNode(ISD::SHL, MVT::i32, Elt, + DAG.getConstant(scaleFactor.logBase2(), MVT::i32)); + SDValue prefSlot = DAG.getNode(ISD::SUB, MVT::i32, + Elt, DAG.getConstant(2, MVT::i32)); + SDValue corrected = DAG.getNode(ISD::ADD, MVT::i32, prefSlot, + DAG.getConstant(16, MVT::i32)); + + SDValue shiftAmt = DAG.getNode(ISD::SELECT_CC, MVT::i32, + prefSlot, DAG.getConstant(0, MVT::i32), + prefSlot, // trueval + corrected, // falseval + DAG.getCondCode(ISD::SETGT)); + vecShift = DAG.getNode(SPUISD::ROTBYTES_LEFT, VecVT, N, shiftAmt); + break; + } + case MVT::i32: + case MVT::f32: + case MVT::i64: + case MVT::f64: + // Simple left shift to slot 0 + Elt = DAG.getNode(ISD::SHL, MVT::i32, Elt, + DAG.getConstant(scaleFactor.logBase2(), MVT::i32)); + vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT, N, Elt); + break; + } + + // Replicate slot 0 across the entire vector (for consistency with the + // notion of a unified register set) + SDValue replicate; + + switch (VT.getSimpleVT()) { + default: + cerr << "LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector type\n"; + abort(); + /*NOTREACHED*/ + case MVT::i8: { + SDValue factor = DAG.getConstant(0x03030303, MVT::i32); + replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor, + factor, factor); + break; + } + case MVT::i16: { + SDValue factor = DAG.getConstant(0x02030203, MVT::i32); + replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor, + factor, factor); + break; + } + case MVT::i32: + case MVT::f32: { + SDValue factor = DAG.getConstant(0x00010203, MVT::i32); + replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor, + factor, factor); + break; + } + case MVT::i64: + case MVT::f64: { + SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32); + SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32); + replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, loFactor, hiFactor, + loFactor, hiFactor); + break; + } + } + + retval = DAG.getNode(SPUISD::EXTRACT_ELT0, VT, + DAG.getNode(SPUISD::SHUFB, VecVT, vecShift, vecShift, replicate)); } - // Need to generate shuffle mask and extract: - int prefslot_begin = -1, prefslot_end = -1; - int elt_byte = EltNo * VT.getSizeInBits() / 8; - - switch (VT.getSimpleVT()) { - default: - assert(false && "Invalid value type!"); - case MVT::i8: { - prefslot_begin = prefslot_end = 3; - break; - } - case MVT::i16: { - prefslot_begin = 2; prefslot_end = 3; - break; - } - case MVT::i32: - case MVT::f32: { - prefslot_begin = 0; prefslot_end = 3; - break; - } - case MVT::i64: - case MVT::f64: { - prefslot_begin = 0; prefslot_end = 7; - break; - } - } - - assert(prefslot_begin != -1 && prefslot_end != -1 && - "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized"); - - for (int i = 0; i < 16; ++i) { - // zero fill uppper part of preferred slot, don't care about the - // other slots: - unsigned int mask_val; - if (i <= prefslot_end) { - mask_val = - ((i < prefslot_begin) - ? 0x80 - : elt_byte + (i - prefslot_begin)); - - ShufMask[i] = DAG.getConstant(mask_val, MVT::i8); - } else - ShufMask[i] = ShufMask[i % (prefslot_end + 1)]; - } - - SDValue ShufMaskVec = - DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, - &ShufMask[0], - sizeof(ShufMask) / sizeof(ShufMask[0])); - - return DAG.getNode(SPUISD::EXTRACT_ELT0, VT, - DAG.getNode(SPUISD::SHUFB, N.getValueType(), - N, N, ShufMaskVec)); - + return retval; } static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { @@ -2145,7 +2272,7 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getNode(SPUISD::SHUFB, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, ValOp), VecOp, - DAG.getNode(SPUISD::INSERT_MASK, VT, + DAG.getNode(SPUISD::SHUFFLE_MASK, VT, DAG.getNode(ISD::ADD, PtrVT, PtrBase, DAG.getConstant(CN->getZExtValue(), @@ -2614,8 +2741,39 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { return SDValue(); } -/// LowerOperation - Provide custom lowering hooks for some operations. -/// +//! Lower ISD::SELECT_CC +/*! + ISD::SELECT_CC can (generally) be implemented directly on the SPU using the + SELB instruction. + + \note Need to revisit this in the future: if the code path through the true + and false value computations is longer than the latency of a branch (6 + cycles), then it would be more advantageous to branch and insert a new basic + block and branch on the condition. However, this code does not make that + assumption, given the simplisitc uses so far. + */ + +static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType(); + SDValue lhs = Op.getOperand(0); + SDValue rhs = Op.getOperand(1); + SDValue trueval = Op.getOperand(2); + SDValue falseval = Op.getOperand(3); + SDValue condition = Op.getOperand(4); + + // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's + // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up + // with another "cannot select select_cc" assert: + + SDValue compare = DAG.getNode(ISD::SETCC, VT, lhs, rhs, condition); + return DAG.getNode(SPUISD::SELB, VT, trueval, falseval, compare); +} + +//! Custom (target-specific) lowering entry point +/*! + This is where LLVM's DAG selection process calls to do target-specific + lowering of nodes. + */ SDValue SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { @@ -2704,13 +2862,19 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) case ISD::FDIV: if (VT == MVT::f32 || VT == MVT::v4f32) return LowerFDIVf32(Op, DAG); -// else if (Op.getValueType() == MVT::f64) -// return LowerFDIVf64(Op, DAG); +#if 0 + // This is probably a libcall + else if (Op.getValueType() == MVT::f64) + return LowerFDIVf64(Op, DAG); +#endif else assert(0 && "Calling FDIV on unsupported MVT"); case ISD::CTPOP: return LowerCTPOP(Op, DAG); + + case ISD::SELECT_CC: + return LowerSELECT_CC(Op, DAG); } return SDValue(); @@ -2967,7 +3131,7 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, #if 0 case CALL: case SHUFB: - case INSERT_MASK: + case SHUFFLE_MASK: case CNTB: #endif diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index 1be376ce31b..d6fb3f80010 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -37,7 +37,7 @@ namespace llvm { LDRESULT, ///< Load result (value, chain) CALL, ///< CALL instruction SHUFB, ///< Vector shuffle (permute) - INSERT_MASK, ///< Insert element shuffle mask + SHUFFLE_MASK, ///< Shuffle mask CNTB, ///< Count leading ones in bytes PROMOTE_SCALAR, ///< Promote scalar->vector EXTRACT_ELT0, ///< Extract element 0 diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index 990865df04e..a6ab49aa85a 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -272,51 +272,51 @@ def STQR : RI16Form<0b111000100, (outs), (ins VECREG:$rT, s16imm:$disp), def CBD : RI7Form<0b10101111100, (outs VECREG:$rT), (ins memri7:$src), "cbd\t$rT, $src", ShuffleOp, - [(set (v16i8 VECREG:$rT), (SPUvecinsmask dform2_addr:$src))]>; + [(set (v16i8 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; def CBX : RRForm<0b00101011100, (outs VECREG:$rT), (ins memrr:$src), "cbx\t$rT, $src", ShuffleOp, - [(set (v16i8 VECREG:$rT), (SPUvecinsmask xform_addr:$src))]>; + [(set (v16i8 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; def CHD : RI7Form<0b10101111100, (outs VECREG:$rT), (ins memri7:$src), "chd\t$rT, $src", ShuffleOp, - [(set (v8i16 VECREG:$rT), (SPUvecinsmask dform2_addr:$src))]>; + [(set (v8i16 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; def CHX : RRForm<0b10101011100, (outs VECREG:$rT), (ins memrr:$src), "chx\t$rT, $src", ShuffleOp, - [(set (v8i16 VECREG:$rT), (SPUvecinsmask xform_addr:$src))]>; + [(set (v8i16 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; def CWD : RI7Form<0b01101111100, (outs VECREG:$rT), (ins memri7:$src), "cwd\t$rT, $src", ShuffleOp, - [(set (v4i32 VECREG:$rT), (SPUvecinsmask dform2_addr:$src))]>; + [(set (v4i32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; def CWDf32 : RI7Form<0b01101111100, (outs VECREG:$rT), (ins memri7:$src), "cwd\t$rT, $src", ShuffleOp, - [(set (v4f32 VECREG:$rT), (SPUvecinsmask dform2_addr:$src))]>; + [(set (v4f32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; def CWX : RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src), "cwx\t$rT, $src", ShuffleOp, - [(set (v4i32 VECREG:$rT), (SPUvecinsmask xform_addr:$src))]>; + [(set (v4i32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; def CWXf32 : RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src), "cwx\t$rT, $src", ShuffleOp, - [(set (v4f32 VECREG:$rT), (SPUvecinsmask xform_addr:$src))]>; + [(set (v4f32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; def CDD : RI7Form<0b11101111100, (outs VECREG:$rT), (ins memri7:$src), "cdd\t$rT, $src", ShuffleOp, - [(set (v2i64 VECREG:$rT), (SPUvecinsmask dform2_addr:$src))]>; + [(set (v2i64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; def CDDf64 : RI7Form<0b11101111100, (outs VECREG:$rT), (ins memri7:$src), "cdd\t$rT, $src", ShuffleOp, - [(set (v2f64 VECREG:$rT), (SPUvecinsmask dform2_addr:$src))]>; + [(set (v2f64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; def CDX : RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src), "cdx\t$rT, $src", ShuffleOp, - [(set (v2i64 VECREG:$rT), (SPUvecinsmask xform_addr:$src))]>; + [(set (v2i64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; def CDXf64 : RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src), "cdx\t$rT, $src", ShuffleOp, - [(set (v2f64 VECREG:$rT), (SPUvecinsmask xform_addr:$src))]>; + [(set (v2f64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; //===----------------------------------------------------------------------===// // Constant formation: @@ -1647,14 +1647,23 @@ multiclass SelectBits defm SELB : SelectBits; -class SPUselbPat: +class SPUselbPatVec: Pat<(SPUselb (vectype VECREG:$rA), (vectype VECREG:$rB), (vectype VECREG:$rC)), (inst VECREG:$rA, VECREG:$rB, VECREG:$rC)>; -def : SPUselbPat; -def : SPUselbPat; -def : SPUselbPat; -def : SPUselbPat; +def : SPUselbPatVec; +def : SPUselbPatVec; +def : SPUselbPatVec; +def : SPUselbPatVec; + +class SPUselbPatReg: + Pat<(SPUselb rclass:$rA, rclass:$rB, rclass:$rC), + (inst rclass:$rA, rclass:$rB, rclass:$rC)>; + +def : SPUselbPatReg; +def : SPUselbPatReg; +def : SPUselbPatReg; +def : SPUselbPatReg; class SelectConditional: Pat<(select rclass:$rCond, rclass:$rTrue, rclass:$rFalse), @@ -1811,8 +1820,8 @@ def : SHUFBVecPat1; def : SHUFBVecPat1; // Shuffle mask is a v4i32 vector: +def : SHUFBVecPat1; def : SHUFBVecPat1; -def : SHUFBVecPat1; def : SHUFBVecPat1; def : SHUFBVecPat1; def : SHUFBVecPat1; @@ -1939,7 +1948,9 @@ multiclass ShiftLeftQuadByBits def v16i8: SHLQBIVecInst; def v8i16: SHLQBIVecInst; def v4i32: SHLQBIVecInst; + def v4f32: SHLQBIVecInst; def v2i64: SHLQBIVecInst; + def v2f64: SHLQBIVecInst; } defm SHLQBI : ShiftLeftQuadByBits; @@ -1960,7 +1971,9 @@ multiclass ShiftLeftQuadByBitsImm def v16i8 : SHLQBIIVecInst; def v8i16 : SHLQBIIVecInst; def v4i32 : SHLQBIIVecInst; + def v4f32 : SHLQBIIVecInst; def v2i64 : SHLQBIIVecInst; + def v2f64 : SHLQBIIVecInst; } defm SHLQBII : ShiftLeftQuadByBitsImm; @@ -1982,7 +1995,9 @@ multiclass ShiftLeftQuadBytes def v16i8: SHLQBYVecInst; def v8i16: SHLQBYVecInst; def v4i32: SHLQBYVecInst; + def v4f32: SHLQBYVecInst; def v2i64: SHLQBYVecInst; + def v2f64: SHLQBYVecInst; def r128: SHLQBYInst<(outs GPRC:$rT), (ins GPRC:$rA, R32C:$rB), [(set GPRC:$rT, (SPUshlquad_l_bytes GPRC:$rA, R32C:$rB))]>; } @@ -2003,7 +2018,9 @@ multiclass ShiftLeftQuadBytesImm def v16i8: SHLQBYIVecInst; def v8i16: SHLQBYIVecInst; def v4i32: SHLQBYIVecInst; + def v4f32: SHLQBYIVecInst; def v2i64: SHLQBYIVecInst; + def v2f64: SHLQBYIVecInst; def r128: SHLQBYIInst<(outs GPRC:$rT), (ins GPRC:$rA, u7imm_i32:$val), [(set GPRC:$rT, (SPUshlquad_l_bytes GPRC:$rA, (i32 uimm7:$val)))]>; diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td index 63b852f2542..2c6fc31e635 100644 --- a/lib/Target/CellSPU/SPUNodes.td +++ b/lib/Target/CellSPU/SPUNodes.td @@ -16,7 +16,7 @@ def SDT_SPUCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>; // SPU_GenControl: Type profile for generating control words for insertions def SPU_GenControl : SDTypeProfile<1, 1, []>; -def SPUvecinsmask : SDNode<"SPUISD::INSERT_MASK", SPU_GenControl, []>; +def SPUshufmask : SDNode<"SPUISD::SHUFFLE_MASK", SPU_GenControl, []>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPUCallSeq, [SDNPHasChain, SDNPOutFlag]>; diff --git a/test/CodeGen/CellSPU/extract_elt.ll b/test/CodeGen/CellSPU/extract_elt.ll index 6e05686f408..eb5e3a5e1fd 100644 --- a/test/CodeGen/CellSPU/extract_elt.ll +++ b/test/CodeGen/CellSPU/extract_elt.ll @@ -1,10 +1,36 @@ ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s -; RUN: llvm-as -o - %s | llc -march=cellspu -mattr=large_mem > %t2.s -; RUN: grep shufb %t1.s | count 27 -; RUN: grep lqa %t1.s | count 27 -; RUN: grep lqd %t2.s | count 27 -; RUN: grep space %t1.s | count 8 -; RUN: grep byte %t1.s | count 424 +; RUN: grep shufb %t1.s | count 39 +; RUN: grep ilhu %t1.s | count 31 +; RUN: grep iohl %t1.s | count 31 +; RUN: grep lqa %t1.s | count 10 +; RUN: grep shlqbyi %t1.s | count 8 +; RUN: grep selb %t1.s | count 4 +; RUN: grep cgti %t1.s | count 4 +; RUN: grep 515 %t1.s | count 5 +; RUN: grep 1029 %t1.s | count 2 +; RUN: grep 1543 %t1.s | count 2 +; RUN: grep 2057 %t1.s | count 2 +; RUN: grep 2571 %t1.s | count 2 +; RUN: grep 3085 %t1.s | count 2 +; RUN: grep 3599 %t1.s | count 2 +; RUN: grep 32768 %t1.s | count 1 +; RUN: grep 32769 %t1.s | count 1 +; RUN: grep 32770 %t1.s | count 1 +; RUN: grep 32771 %t1.s | count 1 +; RUN: grep 32772 %t1.s | count 1 +; RUN: grep 32773 %t1.s | count 1 +; RUN: grep 32774 %t1.s | count 1 +; RUN: grep 32775 %t1.s | count 1 +; RUN: grep 32776 %t1.s | count 1 +; RUN: grep 32777 %t1.s | count 1 +; RUN: grep 32778 %t1.s | count 1 +; RUN: grep 32779 %t1.s | count 1 +; RUN: grep 32780 %t1.s | count 1 +; RUN: grep 32781 %t1.s | count 1 +; RUN: grep 32782 %t1.s | count 1 +; RUN: grep 32783 %t1.s | count 1 +; RUN: grep 32896 %t1.s | count 24 + target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128" target triple = "spu" @@ -175,3 +201,79 @@ entry: %a = extractelement <16 x i8> %v, i32 15 ret i8 %a } + +;;-------------------------------------------------------------------------- +;; extract element, variable index: +;;-------------------------------------------------------------------------- + +define i8 @extract_varadic_i8(i32 %i) nounwind readnone { +entry: + %0 = extractelement <16 x i8> < i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, i32 %i + ret i8 %0 +} + +define i8 @extract_varadic_i8_1(<16 x i8> %v, i32 %i) nounwind readnone { +entry: + %0 = extractelement <16 x i8> %v, i32 %i + ret i8 %0 +} + +define i16 @extract_varadic_i16(i32 %i) nounwind readnone { +entry: + %0 = extractelement <8 x i16> < i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, i32 %i + ret i16 %0 +} + +define i16 @extract_varadic_i16_1(<8 x i16> %v, i32 %i) nounwind readnone { +entry: + %0 = extractelement <8 x i16> %v, i32 %i + ret i16 %0 +} + +define i32 @extract_varadic_i32(i32 %i) nounwind readnone { +entry: + %0 = extractelement <4 x i32> < i32 0, i32 1, i32 2, i32 3>, i32 %i + ret i32 %0 +} + +define i32 @extract_varadic_i32_1(<4 x i32> %v, i32 %i) nounwind readnone { +entry: + %0 = extractelement <4 x i32> %v, i32 %i + ret i32 %0 +} + +define float @extract_varadic_f32(i32 %i) nounwind readnone { +entry: + %0 = extractelement <4 x float> < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >, i32 %i + ret float %0 +} + +define float @extract_varadic_f32_1(<4 x float> %v, i32 %i) nounwind readnone { +entry: + %0 = extractelement <4 x float> %v, i32 %i + ret float %0 +} + +define i64 @extract_varadic_i64(i32 %i) nounwind readnone { +entry: + %0 = extractelement <2 x i64> < i64 0, i64 1>, i32 %i + ret i64 %0 +} + +define i64 @extract_varadic_i64_1(<2 x i64> %v, i32 %i) nounwind readnone { +entry: + %0 = extractelement <2 x i64> %v, i32 %i + ret i64 %0 +} + +define double @extract_varadic_f64(i32 %i) nounwind readnone { +entry: + %0 = extractelement <2 x double> < double 1.000000e+00, double 2.000000e+00>, i32 %i + ret double %0 +} + +define double @extract_varadic_f64_1(<2 x double> %v, i32 %i) nounwind readnone { +entry: + %0 = extractelement <2 x double> %v, i32 %i + ret double %0 +}