From b30e8f6334e9dd04098e834303ae1bd38abe20b0 Mon Sep 17 00:00:00 2001
From: Scott Michel
Date: Tue, 2 Dec 2008 19:53:53 +0000
Subject: [PATCH] CellSPU:

- Incorporate Tilmann Scheller's ISD::TRUNCATE custom lowering patch
- Update SPU calling convention info, even though it's not used yet (it may
  be at some point)
- Ensure that any-extended f32 loads are custom lowered, especially when
  they're promoted for use in printf.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@60438 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/CellSPU/SPUCallingConv.td   |  86 +++++++++++++++----
 lib/Target/CellSPU/SPUISelLowering.cpp | 110 ++++++++++++++++++++++++-
 lib/Target/CellSPU/SPUInstrInfo.td     |  28 -------
 test/CodeGen/CellSPU/trunc.ll          |  81 ++++++++++++++++++
 4 files changed, 260 insertions(+), 45 deletions(-)
 create mode 100644 test/CodeGen/CellSPU/trunc.ll

diff --git a/lib/Target/CellSPU/SPUCallingConv.td b/lib/Target/CellSPU/SPUCallingConv.td
index e62473671ab..4bad19850a0 100644
--- a/lib/Target/CellSPU/SPUCallingConv.td
+++ b/lib/Target/CellSPU/SPUCallingConv.td
@@ -21,6 +21,8 @@ class CCIfSubtarget
 // Return-value convention for Cell SPU: Everything can be passed back via $3:
 def RetCC_SPU : CallingConv<[
+  CCIfType<[i8], CCAssignToReg<[R3]>>,
+  CCIfType<[i16], CCAssignToReg<[R3]>>,
   CCIfType<[i32], CCAssignToReg<[R3]>>,
   CCIfType<[i64], CCAssignToReg<[R3]>>,
   CCIfType<[f32, f64], CCAssignToReg<[R3]>>,
@@ -30,30 +32,82 @@ def RetCC_SPU : CallingConv<[
 //===----------------------------------------------------------------------===//
 // CellSPU Argument Calling Conventions
-// FIXME
+// (note: this isn't used, but presumably should be at some point when other
+// targets do.)
 //===----------------------------------------------------------------------===//
 /*
 def CC_SPU : CallingConv<[
-  // The first 8 integer arguments are passed in integer registers.
- CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>, - CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>, + CCIfType<[i8], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74, + R75, R76, R77, R78, R79]>>, + CCIfType<[i16], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74, + R75, R76, R77, R78, R79]>>, + CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74, + R75, R76, R77, R78, R79]>>, + CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74, + R75, R76, R77, R78, R79]>>, + CCIfType<[i64], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74, + R75, R76, R77, R78, R79]>>, + CCIfType<[f64], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74, + R75, R76, R77, R78, R79]>>, + CCIfType<[v16i8, v8i16, v4i32, v4f32, v2i64, v2f64], + CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10, R11, + R12, R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, R29, + R30, R31, R32, R33, R34, R35, R36, R37, R38, + R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, + R57, R58, R59, R60, R61, R62, R63, R64, R65, + R66, R67, R68, R69, R70, R71, R72, R73, R74, + R75, R76, R77, R78, R79]>>, - // SPU can pass back arguments in all - CCIfType<[f32, f64], CCIfSubtarget<"isMachoABI()", - CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8,F9,F10,F11,F12,F13]>>>, - // Other sub-targets pass FP values in F1-10. - CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8, F9,F10]>>, - - // The first 12 Vector arguments are passed in altivec registers. 
- CCIfType<[v16i8, v8i16, v4i32, v4f32], - CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10,V11,V12,V13]>> -/* // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, // Vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCAssignToStack<16, 16>>*/ + CCAssignToStack<16, 16>> ]>; - */ +*/ diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 222c0d6145a..9913a8bc9eb 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -151,6 +151,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Custom); + // SPU constant load actions are custom lowered: setOperationAction(ISD::Constant, MVT::i64, Custom); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); @@ -277,6 +279,12 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); + // Custom lower truncates + setOperationAction(ISD::TRUNCATE, MVT::i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::i64, Custom); + // SPU has a legal FP -> signed INT instruction setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); @@ -782,7 +790,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { DAG.setRoot(currentRoot); } #endif - + return result; /*UNREACHED*/ } @@ -2759,6 +2767,102 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(SPUISD::SELB, VT, trueval, falseval, compare); } +//! 
Custom lower ISD::TRUNCATE +static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) +{ + MVT VT = Op.getValueType(); + MVT::SimpleValueType simpleVT = VT.getSimpleVT(); + MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())); + + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = Op0.getValueType(); + MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits())); + + SDValue PromoteScalar = DAG.getNode(SPUISD::PROMOTE_SCALAR, Op0VecVT, Op0); + + unsigned maskLow; + unsigned maskHigh; + + // Create shuffle mask + switch (Op0VT.getSimpleVT()) { + case MVT::i128: + switch (simpleVT) { + case MVT::i64: + // least significant doubleword of quadword + maskHigh = 0x08090a0b; + maskLow = 0x0c0d0e0f; + break; + case MVT::i32: + // least significant word of quadword + maskHigh = maskLow = 0x0c0d0e0f; + break; + case MVT::i16: + // least significant halfword of quadword + maskHigh = maskLow = 0x0e0f0e0f; + break; + case MVT::i8: + // least significant byte of quadword + maskHigh = maskLow = 0x0f0f0f0f; + break; + default: + cerr << "Truncation to illegal type!"; + abort(); + } + break; + case MVT::i64: + switch (simpleVT) { + case MVT::i32: + // least significant word of doubleword + maskHigh = maskLow = 0x04050607; + break; + case MVT::i16: + // least significant halfword of doubleword + maskHigh = maskLow = 0x06070607; + break; + case MVT::i8: + // least significant byte of doubleword + maskHigh = maskLow = 0x07070707; + break; + default: + cerr << "Truncation to illegal type!"; + abort(); + } + break; + case MVT::i32: + case MVT::i16: + switch (simpleVT) { + case MVT::i16: + // least significant halfword of word + maskHigh = maskLow = 0x02030203; + break; + case MVT::i8: + // least significant byte of word/halfword + maskHigh = maskLow = 0x03030303; + break; + default: + cerr << "Truncation to illegal type!"; + abort(); + } + break; + default: + cerr << "Trying to lower truncation from illegal type!"; + abort(); + } + + // Use a shuffle to perform the truncation + SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, + DAG.getConstant(maskHigh, MVT::i32), + DAG.getConstant(maskLow, MVT::i32), + DAG.getConstant(maskHigh, MVT::i32), + DAG.getConstant(maskLow, MVT::i32)); + + SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT, + PromoteScalar, PromoteScalar, shufMask); + + return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, + DAG.getNode(ISD::BIT_CONVERT, VecVT, truncShuffle)); +} + //! Custom (target-specific) lowering entry point /*! 
This is where LLVM's DAG selection process calls to do target-specific @@ -2779,6 +2883,7 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) abort(); } case ISD::LOAD: + case ISD::EXTLOAD: case ISD::SEXTLOAD: case ISD::ZEXTLOAD: return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl()); @@ -2865,6 +2970,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + + case ISD::TRUNCATE: + return LowerTRUNCATE(Op, DAG); } return SDValue(); diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index e72a1bbe523..9b6df875a42 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -1371,13 +1371,6 @@ multiclass BitwiseOrByteImm defm ORBI : BitwiseOrByteImm; -// Truncate i16 -> i8 -def ORBItrunc : ORBIInst<(outs R8C:$rT), (ins R16C:$rA, u10imm:$val), - [/* empty */]>; - -def : Pat<(trunc R16C:$rSrc), - (ORBItrunc R16C:$rSrc, 0)>; - // OR halfword immediate class ORHIInst pattern>: RI10Form<0b10100000, OOL, IOL, "orhi\t$rT, $rA, $val", @@ -1403,13 +1396,6 @@ multiclass BitwiseOrHalfwordImm defm ORHI : BitwiseOrHalfwordImm; -// Truncate i32 -> i16 -def ORHItrunc : ORHIInst<(outs R16C:$rT), (ins R32C:$rA, u10imm:$val), - [/* empty */]>; - -def : Pat<(trunc R32C:$rSrc), - (ORHItrunc R32C:$rSrc, 0)>; - class ORIInst pattern>: RI10Form<0b00100000, OOL, IOL, "ori\t$rT, $rA, $val", IntegerOp, pattern>; @@ -1444,13 +1430,6 @@ multiclass BitwiseOrImm defm ORI : BitwiseOrImm; -// Truncate i64 -> i32 -def ORItrunc : ORIInst<(outs R32C:$rT), (ins R64C:$rA, u10imm_i32:$val), - [/* empty */]>; - -def : Pat<(trunc R64C:$rSrc), - (ORItrunc R64C:$rSrc, 0)>; - // ORX: "or" across the vector: or's $rA's word slots leaving the result in // $rT[0], slots 1-3 are zeroed. 
// @@ -2014,13 +1993,6 @@ multiclass ShiftLeftQuadBytesImm defm SHLQBYI : ShiftLeftQuadBytesImm; -// Special form for truncating i64 to i32: -def SHLQBYItrunc64: SHLQBYIInst<(outs R32C:$rT), (ins R64C:$rA, u7imm_i32:$val), - [/* no pattern, see below */]>; - -def : Pat<(trunc R64C:$rSrc), - (SHLQBYItrunc64 R64C:$rSrc, 4)>; - //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // Rotate halfword: //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ diff --git a/test/CodeGen/CellSPU/trunc.ll b/test/CodeGen/CellSPU/trunc.ll new file mode 100644 index 00000000000..845feed8b33 --- /dev/null +++ b/test/CodeGen/CellSPU/trunc.ll @@ -0,0 +1,81 @@ +; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s +; RUN: grep shufb %t1.s | count 9 +; RUN: grep {ilhu.*1799} %t1.s | count 1 +; RUN: grep {ilhu.*771} %t1.s | count 3 +; RUN: grep {ilhu.*1543} %t1.s | count 1 +; RUN: grep {ilhu.*1029} %t1.s | count 1 +; RUN: grep {ilhu.*515} %t1.s | count 1 +; RUN: grep {iohl.*1799} %t1.s | count 1 +; RUN: grep {iohl.*771} %t1.s | count 3 +; RUN: grep {iohl.*1543} %t1.s | count 2 +; RUN: grep {iohl.*515} %t1.s | count 1 +; RUN: grep xsbh %t1.s | count 6 +; RUN: grep sfh %t1.s | count 5 + +; ModuleID = 'trunc.bc' +target datalayout = "E-p:32:32:128-i1:8:128-i8:8:128-i16:16:128-i32:32:128-i64:32:128-f32:32:128-f64:64:128-v64:64:64-v128:128:128-a0:0:128-s0:128:128" +target triple = "spu" + +; codegen for i128 arguments is not implemented yet on CellSPU +; once this changes uncomment the functions below +; and update the expected results accordingly + +;define i8 @trunc_i128_i8(i128 %u) nounwind readnone { +;entry: +; %0 = trunc i128 %u to i8 +; ret i8 %0 +;} +;define i16 @trunc_i128_i16(i128 %u) nounwind readnone { +;entry: +; %0 = trunc i128 %u to i16 +; ret i16 %0 +;} +;define i32 @trunc_i128_i32(i128 %u) nounwind readnone { +;entry: +; %0 = trunc i128 %u to i32 +; ret i32 %0 +;} +;define i64 @trunc_i128_i64(i128 %u) nounwind readnone { +;entry: +; %0 = trunc i128 %u to i64 +; ret i64 %0 +;} + +define i8 @trunc_i64_i8(i64 %u, i8 %v) nounwind readnone { +entry: + %0 = trunc i64 %u to i8 + %1 = sub i8 %0, %v + ret i8 %1 +} +define i16 @trunc_i64_i16(i64 %u, i16 %v) nounwind readnone { +entry: + %0 = trunc i64 %u to i16 + %1 = sub i16 %0, %v + ret i16 %1 +} +define i32 @trunc_i64_i32(i64 %u, i32 %v) nounwind readnone { +entry: + %0 = trunc i64 %u to i32 + %1 = sub i32 %0, %v + ret i32 %1 +} + +define i8 @trunc_i32_i8(i32 %u, i8 %v) nounwind readnone { +entry: + %0 = trunc i32 %u to i8 + %1 = sub i8 %0, %v + ret i8 %1 +} +define i16 @trunc_i32_i16(i32 %u, i16 %v) nounwind readnone { +entry: + %0 = trunc i32 %u to i16 + %1 = sub i16 %0, %v + ret i16 %1 +} + +define i8 @trunc_i16_i8(i16 %u, i8 %v) nounwind readnone { +entry: + %0 = trunc i16 %u to i8 + %1 = sub i8 %0, %v + ret i8 %1 +}
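Note on the new TRUNCATE lowering (not part of the patch itself): LowerTRUNCATE promotes the scalar source into a vector register and then issues a single SPUISD::SHUFB whose control word repeats the byte indices of the source's least-significant bytes, so the truncated value lands in the destination type's preferred slot, from which VEC2PREFSLOT reads it back out. The sketch below is a minimal host-side simulation of that byte selection for the i64 -> i32 case; it assumes the SPU's big-endian, left-justified preferred-slot layout and the plain byte-select behavior of shufb for control bytes below 0x20, and is only meant to show why the mask word is 0x04050607.

// Standalone C++ sketch; compile with any hosted C++ compiler.
#include <cstdint>
#include <cstdio>

int main() {
  // A quadword holding an i64 in its preferred slot (bytes 0..7, big-endian),
  // roughly what SPUISD::PROMOTE_SCALAR produces in the lowering.
  uint8_t quad[16] = {0};
  const uint64_t src = 0x1122334455667788ULL;
  for (int i = 0; i < 8; ++i)
    quad[i] = uint8_t(src >> (8 * (7 - i)));

  // Shuffle control built by LowerTRUNCATE for i64 -> i32: the word 0x04050607
  // repeated four times, i.e. every result byte selects one of source bytes 4..7.
  uint8_t ctl[16];
  for (int i = 0; i < 16; ++i)
    ctl[i] = uint8_t(0x04 + (i & 3));

  // shufb-style byte select; both shuffle operands are the same promoted
  // scalar here, so masking the control byte with 0x0f is sufficient.
  uint8_t res[16];
  for (int i = 0; i < 16; ++i)
    res[i] = quad[ctl[i] & 0x0f];

  // Bytes 0..3 of the result -- the i32 preferred slot -- now hold the low
  // 32 bits of src.
  const uint32_t low = (uint32_t(res[0]) << 24) | (uint32_t(res[1]) << 16) |
                       (uint32_t(res[2]) << 8)  |  uint32_t(res[3]);
  std::printf("truncated value: 0x%08x (expected 0x55667788)\n", low);
  return 0;
}

The other source/destination combinations differ only in which byte indices get baked into the control word (for example 0x0c0d0e0f for the least-significant word of a quadword, 0x06070607 for the least-significant halfword of a doubleword), which is exactly what the switch at the top of LowerTRUNCATE selects.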