diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp index 2848d5460ee..c9b29158877 100644 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ b/lib/Target/PTX/PTXAsmPrinter.cpp @@ -92,6 +92,7 @@ static const char *getRegisterTypeName(unsigned RegNo) { #define TEST_REGCLS(cls, clsstr) \ if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr; TEST_REGCLS(RegPred, pred); + TEST_REGCLS(RegI8, b8); TEST_REGCLS(RegI16, b16); TEST_REGCLS(RegI32, b32); TEST_REGCLS(RegI64, b64); @@ -124,6 +125,7 @@ static const char *getTypeName(const Type* type) { case Type::IntegerTyID: switch (type->getPrimitiveSizeInBits()) { default: llvm_unreachable("Unknown integer bit-width"); + case 8: return ".u8"; case 16: return ".u16"; case 32: return ".u32"; case 64: return ".u64"; diff --git a/lib/Target/PTX/PTXCallingConv.td b/lib/Target/PTX/PTXCallingConv.td index 3e3ff489662..2de619920be 100644 --- a/lib/Target/PTX/PTXCallingConv.td +++ b/lib/Target/PTX/PTXCallingConv.td @@ -15,6 +15,7 @@ // PTX Formal Parameter Calling Convention def CC_PTX : CallingConv<[ CCIfType<[i1], CCAssignToReg<[P12, P13, P14, P15, P16, P17, P18, P19, P20, P21, P22, P23, P24, P25, P26, P27, P28, P29, P30, P31, P32, P33, P34, P35, P36, P37, P38, P39, P40, P41, P42, P43, P44, P45, P46, P47, P48, P49, P50, P51, P52, P53, P54, P55, P56, P57, P58, P59, P60, P61, P62, P63, P64, P65, P66, P67, P68, P69, P70, P71, P72, P73, P74, P75, P76, P77, P78, P79, P80, P81, P82, P83, P84, P85, P86, P87, P88, P89, P90, P91, P92, P93, P94, P95, P96, P97, P98, P99, P100, P101, P102, P103, P104, P105, P106, P107, P108, P109, P110, P111, P112, P113, P114, P115, P116, P117, P118, P119, P120, P121, P122, P123, P124, P125, P126, P127]>>, + CCIfType<[i8], CCAssignToReg<[RQ12, RQ13, RQ14, RQ15, RQ16, RQ17, RQ18, RQ19, RQ20, RQ21, RQ22, RQ23, RQ24, RQ25, RQ26, RQ27, RQ28, RQ29, RQ30, RQ31, RQ32, RQ33, RQ34, RQ35, RQ36, RQ37, RQ38, RQ39, RQ40, RQ41, RQ42, RQ43, RQ44, RQ45, RQ46, RQ47, RQ48, RQ49, RQ50, RQ51, RQ52, 
RQ53, RQ54, RQ55, RQ56, RQ57, RQ58, RQ59, RQ60, RQ61, RQ62, RQ63, RQ64, RQ65, RQ66, RQ67, RQ68, RQ69, RQ70, RQ71, RQ72, RQ73, RQ74, RQ75, RQ76, RQ77, RQ78, RQ79, RQ80, RQ81, RQ82, RQ83, RQ84, RQ85, RQ86, RQ87, RQ88, RQ89, RQ90, RQ91, RQ92, RQ93, RQ94, RQ95, RQ96, RQ97, RQ98, RQ99, RQ100, RQ101, RQ102, RQ103, RQ104, RQ105, RQ106, RQ107, RQ108, RQ109, RQ110, RQ111, RQ112, RQ113, RQ114, RQ115, RQ116, RQ117, RQ118, RQ119, RQ120, RQ121, RQ122, RQ123, RQ124, RQ125, RQ126, RQ127]>>, CCIfType<[i16], CCAssignToReg<[RH12, RH13, RH14, RH15, RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23, RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31, RH32, RH33, RH34, RH35, RH36, RH37, RH38, RH39, RH40, RH41, RH42, RH43, RH44, RH45, RH46, RH47, RH48, RH49, RH50, RH51, RH52, RH53, RH54, RH55, RH56, RH57, RH58, RH59, RH60, RH61, RH62, RH63, RH64, RH65, RH66, RH67, RH68, RH69, RH70, RH71, RH72, RH73, RH74, RH75, RH76, RH77, RH78, RH79, RH80, RH81, RH82, RH83, RH84, RH85, RH86, RH87, RH88, RH89, RH90, RH91, RH92, RH93, RH94, RH95, RH96, RH97, RH98, RH99, RH100, RH101, RH102, RH103, RH104, RH105, RH106, RH107, RH108, RH109, RH110, RH111, RH112, RH113, RH114, RH115, RH116, RH117, RH118, RH119, RH120, RH121, RH122, RH123, RH124, RH125, RH126, RH127]>>, CCIfType<[i32,f32], CCAssignToReg<[R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127]>>, CCIfType<[i64,f64], CCAssignToReg<[RD12, RD13, RD14, RD15, RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23, RD24, 
RD25, RD26, RD27, RD28, RD29, RD30, RD31, RD32, RD33, RD34, RD35, RD36, RD37, RD38, RD39, RD40, RD41, RD42, RD43, RD44, RD45, RD46, RD47, RD48, RD49, RD50, RD51, RD52, RD53, RD54, RD55, RD56, RD57, RD58, RD59, RD60, RD61, RD62, RD63, RD64, RD65, RD66, RD67, RD68, RD69, RD70, RD71, RD72, RD73, RD74, RD75, RD76, RD77, RD78, RD79, RD80, RD81, RD82, RD83, RD84, RD85, RD86, RD87, RD88, RD89, RD90, RD91, RD92, RD93, RD94, RD95, RD96, RD97, RD98, RD99, RD100, RD101, RD102, RD103, RD104, RD105, RD106, RD107, RD108, RD109, RD110, RD111, RD112, RD113, RD114, RD115, RD116, RD117, RD118, RD119, RD120, RD121, RD122, RD123, RD124, RD125, RD126, RD127]>> @@ -23,6 +24,7 @@ def CC_PTX : CallingConv<[ // PTX Return Value Calling Convention def RetCC_PTX : CallingConv<[ CCIfType<[i1], CCAssignToReg<[P0, P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11]>>, + CCIfType<[i8], CCAssignToReg<[RQ0, RQ1, RQ2, RQ3, RQ4, RQ5, RQ6, RQ7, RQ8, RQ9, RQ10, RQ11]>>, CCIfType<[i16], CCAssignToReg<[RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7, RH8, RH9, RH10, RH11]>>, CCIfType<[i32,f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11]>>, CCIfType<[i64,f64], CCAssignToReg<[RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7, RD8, RD9, RD10, RD11]>> diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index 6b7954d4e9d..9b836a52a39 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -40,6 +40,7 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. 
addRegisterClass(MVT::i1, PTX::RegPredRegisterClass); + addRegisterClass(MVT::i8, PTX::RegI8RegisterClass); addRegisterClass(MVT::i16, PTX::RegI16RegisterClass); addRegisterClass(MVT::i32, PTX::RegI32RegisterClass); addRegisterClass(MVT::i64, PTX::RegI64RegisterClass); @@ -52,10 +53,20 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) setOperationAction(ISD::ConstantFP, MVT::f32, Legal); setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - + + // Promote i1 type + setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + + setTruncStoreAction(MVT::i8, MVT::i1, Promote); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + // Turn i16 (z)extload into load + (z)extend setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand); setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand); // Turn f32 extload into load + fextend setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); @@ -176,6 +187,7 @@ struct argmap_entry { bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; } } argmap[] = { argmap_entry(MVT::i1, PTX::RegPredRegisterClass), + argmap_entry(MVT::i8, PTX::RegI8RegisterClass), argmap_entry(MVT::i16, PTX::RegI16RegisterClass), argmap_entry(MVT::i32, PTX::RegI32RegisterClass), argmap_entry(MVT::i64, PTX::RegI64RegisterClass), @@ -252,6 +264,9 @@ SDValue PTXTargetLowering:: if (RegVT == MVT::i1) { TRC = PTX::RegPredRegisterClass; } + else if (RegVT == MVT::i8) { + TRC = PTX::RegI8RegisterClass; + } else if (RegVT == MVT::i16) { TRC = PTX::RegI16RegisterClass; } diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp index 5bdac8917e4..720d5b15fe7 100644 --- a/lib/Target/PTX/PTXInstrInfo.cpp +++ b/lib/Target/PTX/PTXInstrInfo.cpp @@ -33,6 +33,7 @@ static const struct map_entry { const TargetRegisterClass *cls; const int opcode; } map[] = { + { &PTX::RegI8RegClass, 
PTX::MOVU8rr }, { &PTX::RegI16RegClass, PTX::MOVU16rr }, { &PTX::RegI32RegClass, PTX::MOVU32rr }, { &PTX::RegI64RegClass, PTX::MOVU64rr }, @@ -302,7 +303,9 @@ void PTXInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, int OpCode; // Select the appropriate opcode based on the register class - if (RC == PTX::RegI16RegisterClass) { + if (RC == PTX::RegI8RegisterClass) { + OpCode = PTX::STACKSTOREI8; + } else if (RC == PTX::RegI16RegisterClass) { OpCode = PTX::STACKSTOREI16; } else if (RC == PTX::RegI32RegisterClass) { OpCode = PTX::STACKSTOREI32; @@ -337,7 +340,9 @@ void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, int OpCode; // Select the appropriate opcode based on the register class - if (RC == PTX::RegI16RegisterClass) { + if (RC == PTX::RegI8RegisterClass) { + OpCode = PTX::STACKLOADI8; + } else if (RC == PTX::RegI16RegisterClass) { OpCode = PTX::STACKLOADI16; } else if (RC == PTX::RegI32RegisterClass) { OpCode = PTX::STACKLOADI32; diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index a6c03e54ae6..c5cedb0a8bb 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ b/lib/Target/PTX/PTXInstrInfo.td @@ -537,6 +537,7 @@ multiclass PTX_LD { + defm u8 : PTX_LD; defm u16 : PTX_LD; defm u32 : PTX_LD; defm u64 : PTX_LD; @@ -572,6 +573,7 @@ multiclass PTX_ST { + defm u8 : PTX_ST; defm u16 : PTX_ST; defm u32 : PTX_ST; defm u64 : PTX_ST; @@ -783,22 +785,27 @@ defm XOR : PTX_LOGIC<"xor", xor>; let neverHasSideEffects = 1 in { def MOVPREDrr : InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>; + def MOVU8rr + : InstPTX<(outs RegI8:$d), (ins RegI8:$a), "mov.u8\t$d, $a", []>; def MOVU16rr - : InstPTX<(outs RegI16:$d), (ins RegI16:$a), "mov.u16\t$d, $a", []>; + : InstPTX<(outs RegI16:$d), (ins RegI16:$a), "mov.u16\t$d, $a", []>; def MOVU32rr - : InstPTX<(outs RegI32:$d), (ins RegI32:$a), "mov.u32\t$d, $a", []>; + : InstPTX<(outs RegI32:$d), (ins RegI32:$a), "mov.u32\t$d, $a", []>; def MOVU64rr - : InstPTX<(outs RegI64:$d), 
(ins RegI64:$a), "mov.u64\t$d, $a", []>; + : InstPTX<(outs RegI64:$d), (ins RegI64:$a), "mov.u64\t$d, $a", []>; def MOVF32rr - : InstPTX<(outs RegF32:$d), (ins RegF32:$a), "mov.f32\t$d, $a", []>; + : InstPTX<(outs RegF32:$d), (ins RegF32:$a), "mov.f32\t$d, $a", []>; def MOVF64rr - : InstPTX<(outs RegF64:$d), (ins RegF64:$a), "mov.f64\t$d, $a", []>; + : InstPTX<(outs RegF64:$d), (ins RegF64:$a), "mov.f64\t$d, $a", []>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def MOVPREDri : InstPTX<(outs RegPred:$d), (ins i1imm:$a), "mov.pred\t$d, $a", [(set RegPred:$d, imm:$a)]>; + def MOVU8ri + : InstPTX<(outs RegI8:$d), (ins i8imm:$a), "mov.u8\t$d, $a", + [(set RegI8:$d, imm:$a)]>; def MOVU16ri : InstPTX<(outs RegI16:$d), (ins i16imm:$a), "mov.u16\t$d, $a", [(set RegI16:$d, imm:$a)]>; @@ -838,6 +845,9 @@ let hasSideEffects = 1 in { def LDpiPred : InstPTX<(outs RegPred:$d), (ins MEMpi:$a), "ld.param.pred\t$d, [$a]", [(set RegPred:$d, (PTXloadparam timm:$a))]>; + def LDpiU8 : InstPTX<(outs RegI8:$d), (ins MEMpi:$a), + "ld.param.u8\t$d, [$a]", + [(set RegI8:$d, (PTXloadparam timm:$a))]>; def LDpiU16 : InstPTX<(outs RegI16:$d), (ins MEMpi:$a), "ld.param.u16\t$d, [$a]", [(set RegI16:$d, (PTXloadparam timm:$a))]>; @@ -857,6 +867,9 @@ let hasSideEffects = 1 in { def STpiPred : InstPTX<(outs), (ins MEMret:$d, RegPred:$a), "st.param.pred\t[$d], $a", [(PTXstoreparam timm:$d, RegPred:$a)]>; + def STpiU8 : InstPTX<(outs), (ins MEMret:$d, RegI8:$a), + "st.param.u8\t[$d], $a", + [(PTXstoreparam timm:$d, RegI8:$a)]>; def STpiU16 : InstPTX<(outs), (ins MEMret:$d, RegI16:$a), "st.param.u16\t[$d], $a", [(PTXstoreparam timm:$d, RegI16:$a)]>; @@ -887,6 +900,10 @@ defm STs : PTX_ST_ALL<"st.shared", store_shared>; // PTX does not directly support converting to a predicate type, so we fake it // by performing a greater-than test between the value and zero. This follows // the C convention that any non-zero value is equivalent to 'true'. 
+def CVT_pred_u8 + : InstPTX<(outs RegPred:$d), (ins RegI8:$a), "setp.gt.b8\t$d, $a, 0", + [(set RegPred:$d, (trunc RegI8:$a))]>; + def CVT_pred_u16 : InstPTX<(outs RegPred:$d), (ins RegI16:$a), "setp.gt.b16\t$d, $a, 0", [(set RegPred:$d, (trunc RegI16:$a))]>; @@ -907,6 +924,34 @@ def CVT_pred_f64 : InstPTX<(outs RegPred:$d), (ins RegF64:$a), "setp.gt.b64\t$d, $a, 0", [(set RegPred:$d, (fp_to_uint RegF64:$a))]>; +// Conversion to u8 +// PTX does not directly support converting a predicate to a value, so we +// use a select instruction to select either 0 or 1 (integer or fp) based +// on the truth value of the predicate. Sign extension selects -1 instead of 1. +def CVT_u8_pred + : InstPTX<(outs RegI8:$d), (ins RegPred:$a), "selp.u8\t$d, 1, 0, $a", + [(set RegI8:$d, (zext RegPred:$a))]>; + +def CVT_u8_preds + : InstPTX<(outs RegI8:$d), (ins RegPred:$a), "selp.s8\t$d, -1, 0, $a", + [(set RegI8:$d, (sext RegPred:$a))]>; + +def CVT_u8_u32 + : InstPTX<(outs RegI8:$d), (ins RegI32:$a), "cvt.u8.u32\t$d, $a", + [(set RegI8:$d, (trunc RegI32:$a))]>; + +def CVT_u8_u64 + : InstPTX<(outs RegI8:$d), (ins RegI64:$a), "cvt.u8.u64\t$d, $a", + [(set RegI8:$d, (trunc RegI64:$a))]>; + +def CVT_u8_f32 + : InstPTX<(outs RegI8:$d), (ins RegF32:$a), "cvt.rzi.u8.f32\t$d, $a", + [(set RegI8:$d, (fp_to_uint RegF32:$a))]>; + +def CVT_u8_f64 + : InstPTX<(outs RegI8:$d), (ins RegF64:$a), "cvt.rzi.u8.f64\t$d, $a", + [(set RegI8:$d, (fp_to_uint RegF64:$a))]>; + // Conversion to u16 // PTX does not directly support converting a predicate to a value, so we // use a select instruction to select either 0 or 1 (integer or fp) based @@ -915,6 +960,18 @@ def CVT_u16_pred : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "selp.u16\t$d, 1, 0, $a", [(set RegI16:$d, (zext RegPred:$a))]>; +def CVT_u16_preds + : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "selp.s16\t$d, -1, 0, $a", + [(set RegI16:$d, (sext RegPred:$a))]>; + +def CVT_u16_u8 + : InstPTX<(outs RegI16:$d), (ins RegI8:$a), "cvt.u16.u8\t$d, $a", + [(set RegI16:$d, (zext RegI8:$a))]>; + +def 
CVT_u16_s8 + : InstPTX<(outs RegI16:$d), (ins RegI8:$a), "cvt.u16.s8\t$d, $a", + [(set RegI16:$d, (sext RegI8:$a))]>; + def CVT_u16_u32 : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a", [(set RegI16:$d, (trunc RegI32:$a))]>; @@ -937,10 +994,26 @@ def CVT_u32_pred : InstPTX<(outs RegI32:$d), (ins RegPred:$a), "selp.u32\t$d, 1, 0, $a", [(set RegI32:$d, (zext RegPred:$a))]>; +def CVT_u32_u8 + : InstPTX<(outs RegI32:$d), (ins RegI8:$a), "cvt.u32.u8\t$d, $a", + [(set RegI32:$d, (zext RegI8:$a))]>; + def CVT_u32_u16 : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a", [(set RegI32:$d, (zext RegI16:$a))]>; +def CVT_u32_preds + : InstPTX<(outs RegI32:$d), (ins RegPred:$a), "selp.s32\t$d, -1, 0, $a", + [(set RegI32:$d, (sext RegPred:$a))]>; + +def CVT_u32_s8 + : InstPTX<(outs RegI32:$d), (ins RegI8:$a), "cvt.u32.s8\t$d, $a", + [(set RegI32:$d, (sext RegI8:$a))]>; + +def CVT_u32_s16 + : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.s16\t$d, $a", + [(set RegI32:$d, (sext RegI16:$a))]>; + def CVT_u32_u64 : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a", [(set RegI32:$d, (trunc RegI64:$a))]>; @@ -959,6 +1032,10 @@ def CVT_u64_pred : InstPTX<(outs RegI64:$d), (ins RegPred:$a), "selp.u64\t$d, 1, 0, $a", [(set RegI64:$d, (zext RegPred:$a))]>; +def CVT_u64_u8 + : InstPTX<(outs RegI64:$d), (ins RegI8:$a), "cvt.u64.u8\t$d, $a", + [(set RegI64:$d, (zext RegI8:$a))]>; + def CVT_u64_u16 : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a", [(set RegI64:$d, (zext RegI16:$a))]>; @@ -967,6 +1044,22 @@ def CVT_u64_u32 : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a", [(set RegI64:$d, (zext RegI32:$a))]>; +def CVT_u64_preds + : InstPTX<(outs RegI64:$d), (ins RegPred:$a), "selp.s64\t$d, -1, 0, $a", + [(set RegI64:$d, (sext RegPred:$a))]>; + +def CVT_u64_s8 + : InstPTX<(outs RegI64:$d), (ins RegI8:$a), "cvt.u64.s8\t$d, $a", + [(set RegI64:$d, (sext RegI8:$a))]>; + +def CVT_u64_s16 + : InstPTX<(outs 
RegI64:$d), (ins RegI16:$a), "cvt.u64.s16\t$d, $a", + [(set RegI64:$d, (sext RegI16:$a))]>; + +def CVT_u64_s32 + : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.s32\t$d, $a", + [(set RegI64:$d, (sext RegI32:$a))]>; + def CVT_u64_f32 : InstPTX<(outs RegI64:$d), (ins RegF32:$a), "cvt.rzi.u64.f32\t$d, $a", [(set RegI64:$d, (fp_to_uint RegF32:$a))]>; @@ -982,6 +1075,10 @@ def CVT_f32_pred "selp.f32\t$d, 0F3F800000, 0F00000000, $a", // 1.0 [(set RegF32:$d, (uint_to_fp RegPred:$a))]>; +def CVT_f32_u8 + : InstPTX<(outs RegF32:$d), (ins RegI8:$a), "cvt.rn.f32.u8\t$d, $a", + [(set RegF32:$d, (uint_to_fp RegI8:$a))]>; + def CVT_f32_u16 : InstPTX<(outs RegF32:$d), (ins RegI16:$a), "cvt.rn.f32.u16\t$d, $a", [(set RegF32:$d, (uint_to_fp RegI16:$a))]>; @@ -1005,6 +1102,10 @@ def CVT_f64_pred "selp.f64\t$d, 0D3F80000000000000, 0D0000000000000000, $a", // 1.0 [(set RegF64:$d, (uint_to_fp RegPred:$a))]>; +def CVT_f64_u8 + : InstPTX<(outs RegF64:$d), (ins RegI8:$a), "cvt.rn.f64.u8\t$d, $a", + [(set RegF64:$d, (uint_to_fp RegI8:$a))]>; + def CVT_f64_u16 : InstPTX<(outs RegF64:$d), (ins RegI16:$a), "cvt.rn.f64.u16\t$d, $a", [(set RegF64:$d, (uint_to_fp RegI16:$a))]>; @@ -1043,6 +1144,8 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { ///===- Spill Instructions ------------------------------------------------===// // Special instructions used for stack spilling +def STACKSTOREI8 : InstPTX<(outs), (ins i32imm:$d, RegI8:$a), + "mov.u8\ts$d, $a", []>; def STACKSTOREI16 : InstPTX<(outs), (ins i32imm:$d, RegI16:$a), "mov.u16\ts$d, $a", []>; def STACKSTOREI32 : InstPTX<(outs), (ins i32imm:$d, RegI32:$a), @@ -1054,6 +1157,8 @@ def STACKSTOREF32 : InstPTX<(outs), (ins i32imm:$d, RegF32:$a), def STACKSTOREF64 : InstPTX<(outs), (ins i32imm:$d, RegF64:$a), "mov.f64\ts$d, $a", []>; +def STACKLOADI8 : InstPTX<(outs), (ins RegI8:$d, i32imm:$a), + "mov.u8\t$d, s$a", []>; def STACKLOADI16 : InstPTX<(outs), (ins RegI16:$d, i32imm:$a), "mov.u16\t$d, s$a", []>; def STACKLOADI32 : 
InstPTX<(outs), (ins RegI32:$d, i32imm:$a), diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td index 1313d248325..3e7fe5687a8 100644 --- a/lib/Target/PTX/PTXRegisterInfo.td +++ b/lib/Target/PTX/PTXRegisterInfo.td @@ -151,6 +151,137 @@ def P125 : PTXReg<"p125">; def P126 : PTXReg<"p126">; def P127 : PTXReg<"p127">; +///===- 8-Bit Registers --------------------------------------------------===// + +def RQ0 : PTXReg<"rq0">; +def RQ1 : PTXReg<"rq1">; +def RQ2 : PTXReg<"rq2">; +def RQ3 : PTXReg<"rq3">; +def RQ4 : PTXReg<"rq4">; +def RQ5 : PTXReg<"rq5">; +def RQ6 : PTXReg<"rq6">; +def RQ7 : PTXReg<"rq7">; +def RQ8 : PTXReg<"rq8">; +def RQ9 : PTXReg<"rq9">; +def RQ10 : PTXReg<"rq10">; +def RQ11 : PTXReg<"rq11">; +def RQ12 : PTXReg<"rq12">; +def RQ13 : PTXReg<"rq13">; +def RQ14 : PTXReg<"rq14">; +def RQ15 : PTXReg<"rq15">; +def RQ16 : PTXReg<"rq16">; +def RQ17 : PTXReg<"rq17">; +def RQ18 : PTXReg<"rq18">; +def RQ19 : PTXReg<"rq19">; +def RQ20 : PTXReg<"rq20">; +def RQ21 : PTXReg<"rq21">; +def RQ22 : PTXReg<"rq22">; +def RQ23 : PTXReg<"rq23">; +def RQ24 : PTXReg<"rq24">; +def RQ25 : PTXReg<"rq25">; +def RQ26 : PTXReg<"rq26">; +def RQ27 : PTXReg<"rq27">; +def RQ28 : PTXReg<"rq28">; +def RQ29 : PTXReg<"rq29">; +def RQ30 : PTXReg<"rq30">; +def RQ31 : PTXReg<"rq31">; +def RQ32 : PTXReg<"rq32">; +def RQ33 : PTXReg<"rq33">; +def RQ34 : PTXReg<"rq34">; +def RQ35 : PTXReg<"rq35">; +def RQ36 : PTXReg<"rq36">; +def RQ37 : PTXReg<"rq37">; +def RQ38 : PTXReg<"rq38">; +def RQ39 : PTXReg<"rq39">; +def RQ40 : PTXReg<"rq40">; +def RQ41 : PTXReg<"rq41">; +def RQ42 : PTXReg<"rq42">; +def RQ43 : PTXReg<"rq43">; +def RQ44 : PTXReg<"rq44">; +def RQ45 : PTXReg<"rq45">; +def RQ46 : PTXReg<"rq46">; +def RQ47 : PTXReg<"rq47">; +def RQ48 : PTXReg<"rq48">; +def RQ49 : PTXReg<"rq49">; +def RQ50 : PTXReg<"rq50">; +def RQ51 : PTXReg<"rq51">; +def RQ52 : PTXReg<"rq52">; +def RQ53 : PTXReg<"rq53">; +def RQ54 : PTXReg<"rq54">; +def RQ55 : PTXReg<"rq55">; +def RQ56 : 
PTXReg<"rq56">; +def RQ57 : PTXReg<"rq57">; +def RQ58 : PTXReg<"rq58">; +def RQ59 : PTXReg<"rq59">; +def RQ60 : PTXReg<"rq60">; +def RQ61 : PTXReg<"rq61">; +def RQ62 : PTXReg<"rq62">; +def RQ63 : PTXReg<"rq63">; +def RQ64 : PTXReg<"rq64">; +def RQ65 : PTXReg<"rq65">; +def RQ66 : PTXReg<"rq66">; +def RQ67 : PTXReg<"rq67">; +def RQ68 : PTXReg<"rq68">; +def RQ69 : PTXReg<"rq69">; +def RQ70 : PTXReg<"rq70">; +def RQ71 : PTXReg<"rq71">; +def RQ72 : PTXReg<"rq72">; +def RQ73 : PTXReg<"rq73">; +def RQ74 : PTXReg<"rq74">; +def RQ75 : PTXReg<"rq75">; +def RQ76 : PTXReg<"rq76">; +def RQ77 : PTXReg<"rq77">; +def RQ78 : PTXReg<"rq78">; +def RQ79 : PTXReg<"rq79">; +def RQ80 : PTXReg<"rq80">; +def RQ81 : PTXReg<"rq81">; +def RQ82 : PTXReg<"rq82">; +def RQ83 : PTXReg<"rq83">; +def RQ84 : PTXReg<"rq84">; +def RQ85 : PTXReg<"rq85">; +def RQ86 : PTXReg<"rq86">; +def RQ87 : PTXReg<"rq87">; +def RQ88 : PTXReg<"rq88">; +def RQ89 : PTXReg<"rq89">; +def RQ90 : PTXReg<"rq90">; +def RQ91 : PTXReg<"rq91">; +def RQ92 : PTXReg<"rq92">; +def RQ93 : PTXReg<"rq93">; +def RQ94 : PTXReg<"rq94">; +def RQ95 : PTXReg<"rq95">; +def RQ96 : PTXReg<"rq96">; +def RQ97 : PTXReg<"rq97">; +def RQ98 : PTXReg<"rq98">; +def RQ99 : PTXReg<"rq99">; +def RQ100 : PTXReg<"rq100">; +def RQ101 : PTXReg<"rq101">; +def RQ102 : PTXReg<"rq102">; +def RQ103 : PTXReg<"rq103">; +def RQ104 : PTXReg<"rq104">; +def RQ105 : PTXReg<"rq105">; +def RQ106 : PTXReg<"rq106">; +def RQ107 : PTXReg<"rq107">; +def RQ108 : PTXReg<"rq108">; +def RQ109 : PTXReg<"rq109">; +def RQ110 : PTXReg<"rq110">; +def RQ111 : PTXReg<"rq111">; +def RQ112 : PTXReg<"rq112">; +def RQ113 : PTXReg<"rq113">; +def RQ114 : PTXReg<"rq114">; +def RQ115 : PTXReg<"rq115">; +def RQ116 : PTXReg<"rq116">; +def RQ117 : PTXReg<"rq117">; +def RQ118 : PTXReg<"rq118">; +def RQ119 : PTXReg<"rq119">; +def RQ120 : PTXReg<"rq120">; +def RQ121 : PTXReg<"rq121">; +def RQ122 : PTXReg<"rq122">; +def RQ123 : PTXReg<"rq123">; +def RQ124 : PTXReg<"rq124">; +def RQ125 : PTXReg<"rq125">; 
+def RQ126 : PTXReg<"rq126">; +def RQ127 : PTXReg<"rq127">; + ///===- 16-Bit Registers --------------------------------------------------===// def RH0 : PTXReg<"rh0">; @@ -548,6 +679,7 @@ def RD127 : PTXReg<"rd127">; // Register classes //===----------------------------------------------------------------------===// def RegPred : RegisterClass<"PTX", [i1], 8, (sequence "P%u", 0, 127)>; +def RegI8 : RegisterClass<"PTX", [i8], 8, (sequence "RQ%u", 0, 127)>; def RegI16 : RegisterClass<"PTX", [i16], 16, (sequence "RH%u", 0, 127)>; def RegI32 : RegisterClass<"PTX", [i32], 32, (sequence "R%u", 0, 127)>; def RegI64 : RegisterClass<"PTX", [i64], 64, (sequence "RD%u", 0, 127)>; diff --git a/lib/Target/PTX/generate-register-td.py b/lib/Target/PTX/generate-register-td.py index 15286908961..240239158c1 100755 --- a/lib/Target/PTX/generate-register-td.py +++ b/lib/Target/PTX/generate-register-td.py @@ -15,15 +15,16 @@ from sys import argv, exit, stdout -if len(argv) != 5: - print('Usage: generate-register-td.py ') +if len(argv) != 6: + print('Usage: generate-register-td.py ') exit(1) try: num_pred = int(argv[1]) - num_16bit = int(argv[2]) - num_32bit = int(argv[3]) - num_64bit = int(argv[4]) + num_8bit = int(argv[2]) + num_16bit = int(argv[3]) + num_32bit = int(argv[4]) + num_64bit = int(argv[5]) except: print('ERROR: Invalid integer parameter') exit(1) @@ -60,6 +61,11 @@ td_file.write('\n///===- Predicate Registers ----------------------------------- for r in range(0, num_pred): td_file.write('def P%d : PTXReg<"p%d">;\n' % (r, r)) +# Print 8-bit registers +td_file.write('\n///===- 8-Bit Registers --------------------------------------------------===//\n\n') +for r in range(0, num_8bit): + td_file.write('def RQ%d : PTXReg<"rq%d">;\n' % (r, r)) + # Print 16-bit registers td_file.write('\n///===- 16-Bit Registers --------------------------------------------------===//\n\n') for r in range(0, num_16bit): @@ -86,6 +92,7 @@ td_file.write(''' # Print register classes 
td_file.write('def RegPred : RegisterClass<"PTX", [i1], 8, (sequence "P%%u", 0, %d)>;\n' % (num_pred-1)) +td_file.write('def RegI8 : RegisterClass<"PTX", [i8], 8, (sequence "RQ%%u", 0, %d)>;\n' % (num_8bit-1)) td_file.write('def RegI16 : RegisterClass<"PTX", [i16], 16, (sequence "RH%%u", 0, %d)>;\n' % (num_16bit-1)) td_file.write('def RegI32 : RegisterClass<"PTX", [i32], 32, (sequence "R%%u", 0, %d)>;\n' % (num_32bit-1)) td_file.write('def RegI64 : RegisterClass<"PTX", [i64], 64, (sequence "RD%%u", 0, %d)>;\n' % (num_64bit-1)) @@ -101,16 +108,20 @@ td_file = open('PTXCallingConv.td', 'w') # Reserve 10% of the available registers for return values, and the other 90% # for parameters num_ret_pred = int(0.1 * num_pred) +num_ret_8bit = int(0.1 * num_8bit) num_ret_16bit = int(0.1 * num_16bit) num_ret_32bit = int(0.1 * num_32bit) num_ret_64bit = int(0.1 * num_64bit) num_param_pred = num_pred - num_ret_pred +num_param_8bit = num_8bit - num_ret_8bit num_param_16bit = num_16bit - num_ret_16bit num_param_32bit = num_32bit - num_ret_32bit num_param_64bit = num_64bit - num_ret_64bit param_regs_pred = [('P%d' % (i+num_ret_pred)) for i in range(0, num_param_pred)] ret_regs_pred = ['P%d' % i for i in range(0, num_ret_pred)] +param_regs_8bit = [('RQ%d' % (i+num_ret_8bit)) for i in range(0, num_param_8bit)] +ret_regs_8bit = ['RQ%d' % i for i in range(0, num_ret_8bit)] param_regs_16bit = [('RH%d' % (i+num_ret_16bit)) for i in range(0, num_param_16bit)] ret_regs_16bit = ['RH%d' % i for i in range(0, num_ret_16bit)] param_regs_32bit = [('R%d' % (i+num_ret_32bit)) for i in range(0, num_param_32bit)] @@ -120,6 +131,8 @@ ret_regs_64bit = ['RD%d' % i for i in range(0, num_ret_64bit)] param_list_pred = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_pred) ret_list_pred = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_pred) +param_list_8bit = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_8bit) +ret_list_8bit = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_8bit) param_list_16bit = 
reduce(lambda x, y: '%s, %s' % (x, y), param_regs_16bit) ret_list_16bit = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_16bit) param_list_32bit = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_32bit) @@ -144,6 +157,7 @@ td_file.write(''' // PTX Formal Parameter Calling Convention def CC_PTX : CallingConv<[ CCIfType<[i1], CCAssignToReg<[%s]>>, + CCIfType<[i8], CCAssignToReg<[%s]>>, CCIfType<[i16], CCAssignToReg<[%s]>>, CCIfType<[i32,f32], CCAssignToReg<[%s]>>, CCIfType<[i64,f64], CCAssignToReg<[%s]>> @@ -152,12 +166,13 @@ def CC_PTX : CallingConv<[ // PTX Return Value Calling Convention def RetCC_PTX : CallingConv<[ CCIfType<[i1], CCAssignToReg<[%s]>>, + CCIfType<[i8], CCAssignToReg<[%s]>>, CCIfType<[i16], CCAssignToReg<[%s]>>, CCIfType<[i32,f32], CCAssignToReg<[%s]>>, CCIfType<[i64,f64], CCAssignToReg<[%s]>> ]>; -''' % (param_list_pred, param_list_16bit, param_list_32bit, param_list_64bit, - ret_list_pred, ret_list_16bit, ret_list_32bit, ret_list_64bit)) +''' % (param_list_pred, param_list_8bit, param_list_16bit, param_list_32bit, param_list_64bit, + ret_list_pred, ret_list_8bit, ret_list_16bit, ret_list_32bit, ret_list_64bit)) td_file.close() diff --git a/test/CodeGen/PTX/cvt.ll b/test/CodeGen/PTX/cvt.ll index 18f7ef365b4..dbabbf8938e 100644 --- a/test/CodeGen/PTX/cvt.ll +++ b/test/CodeGen/PTX/cvt.ll @@ -3,6 +3,17 @@ ; preds ; (note: we convert back to i32 to return) +define ptx_device i32 @cvt_pred_i8(i8 %x, i1 %y) { +; CHECK: setp.gt.b8 p[[P0:[0-9]+]], rq{{[0-9]+}}, 0 +; CHECK-NEXT: and.pred p0, p[[P0:[0-9]+]], p{{[0-9]+}}; +; CHECK-NEXT: selp.u32 r{{[0-9]+}}, 1, 0, p[[P0:[0-9]+]]; +; CHECK-NEXT: ret; + %a = trunc i8 %x to i1 + %b = and i1 %a, %y + %c = zext i1 %b to i32 + ret i32 %c +} + define ptx_device i32 @cvt_pred_i16(i16 %x, i1 %y) { ; CHECK: setp.gt.b16 p[[P0:[0-9]+]], rh{{[0-9]+}}, 0 ; CHECK-NEXT: and.pred p0, p[[P0:[0-9]+]], p{{[0-9]+}}; @@ -58,6 +69,43 @@ define ptx_device i32 @cvt_pred_f64(double %x, i1 %y) { ret i32 %c } +; i8 + +define 
ptx_device i8 @cvt_i8_preds(i1 %x) { +; CHECK: selp.u8 rq{{[0-9]+}}, 1, 0, p{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = zext i1 %x to i8 + ret i8 %a +} + +define ptx_device i8 @cvt_i8_i32(i32 %x) { +; CHECK: cvt.u8.u32 rq{{[0-9]+}}, r{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = trunc i32 %x to i8 + ret i8 %a +} + +define ptx_device i8 @cvt_i8_i64(i64 %x) { +; CHECK: cvt.u8.u64 rq{{[0-9]+}}, rd{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = trunc i64 %x to i8 + ret i8 %a +} + +define ptx_device i8 @cvt_i8_f32(float %x) { +; CHECK: cvt.rzi.u8.f32 rq{{[0-9]+}}, r{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = fptoui float %x to i8 + ret i8 %a +} + +define ptx_device i8 @cvt_i8_f64(double %x) { +; CHECK: cvt.rzi.u8.f64 rq{{[0-9]+}}, rd{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = fptoui double %x to i8 + ret i8 %a +} + ; i16 define ptx_device i16 @cvt_i16_preds(i1 %x) { @@ -67,6 +115,13 @@ define ptx_device i16 @cvt_i16_preds(i1 %x) { ret i16 %a } +define ptx_device i16 @cvt_i16_i8(i8 %x) { +; CHECK: cvt.u16.u8 rh{{[0-9]+}}, rq{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = zext i8 %x to i16 + ret i16 %a +} + define ptx_device i16 @cvt_i16_i32(i32 %x) { ; CHECK: cvt.u16.u32 rh{{[0-9]+}}, r{{[0-9]+}}; ; CHECK-NEXT: ret; @@ -104,6 +159,13 @@ define ptx_device i32 @cvt_i32_preds(i1 %x) { ret i32 %a } +define ptx_device i32 @cvt_i32_i8(i8 %x) { +; CHECK: cvt.u32.u8 r{{[0-9]+}}, rq{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = zext i8 %x to i32 + ret i32 %a +} + define ptx_device i32 @cvt_i32_i16(i16 %x) { ; CHECK: cvt.u32.u16 r{{[0-9]+}}, rh{{[0-9]+}}; ; CHECK-NEXT: ret; @@ -141,6 +203,13 @@ define ptx_device i64 @cvt_i64_preds(i1 %x) { ret i64 %a } +define ptx_device i64 @cvt_i64_i8(i8 %x) { +; CHECK: cvt.u64.u8 rd{{[0-9]+}}, rq{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = zext i8 %x to i64 + ret i64 %a +} + define ptx_device i64 @cvt_i64_i16(i16 %x) { ; CHECK: cvt.u64.u16 rd{{[0-9]+}}, rh{{[0-9]+}}; ; CHECK-NEXT: ret; @@ -178,6 +247,13 @@ define ptx_device float @cvt_f32_preds(i1 %x) { ret float %a } +define ptx_device float 
@cvt_f32_i8(i8 %x) { +; CHECK: cvt.rn.f32.u8 r{{[0-9]+}}, rq{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = uitofp i8 %x to float + ret float %a +} + define ptx_device float @cvt_f32_i16(i16 %x) { ; CHECK: cvt.rn.f32.u16 r{{[0-9]+}}, rh{{[0-9]+}}; ; CHECK-NEXT: ret; @@ -215,6 +291,13 @@ define ptx_device double @cvt_f64_preds(i1 %x) { ret double %a } +define ptx_device double @cvt_f64_i8(i8 %x) { +; CHECK: cvt.rn.f64.u8 rd{{[0-9]+}}, rq{{[0-9]+}}; +; CHECK-NEXT: ret; + %a = uitofp i8 %x to double + ret double %a +} + define ptx_device double @cvt_f64_i16(i16 %x) { ; CHECK: cvt.rn.f64.u16 rd{{[0-9]+}}, rh{{[0-9]+}}; ; CHECK-NEXT: ret; diff --git a/test/CodeGen/PTX/ld.ll b/test/CodeGen/PTX/ld.ll index d184d1243ab..951b14b8644 100644 --- a/test/CodeGen/PTX/ld.ll +++ b/test/CodeGen/PTX/ld.ll @@ -1,5 +1,17 @@ ; RUN: llc < %s -march=ptx32 | FileCheck %s +;CHECK: .extern .global .b8 array_i8[10]; +@array_i8 = external global [10 x i8] + +;CHECK: .extern .const .b8 array_constant_i8[10]; +@array_constant_i8 = external addrspace(1) constant [10 x i8] + +;CHECK: .extern .local .b8 array_local_i8[10]; +@array_local_i8 = external addrspace(2) global [10 x i8] + +;CHECK: .extern .shared .b8 array_shared_i8[10]; +@array_shared_i8 = external addrspace(4) global [10 x i8] + ;CHECK: .extern .global .b8 array_i16[20]; @array_i16 = external global [10 x i16] @@ -60,6 +72,13 @@ ;CHECK: .extern .shared .b8 array_shared_double[80]; @array_shared_double = external addrspace(4) global [10 x double] +define ptx_device i8 @t1_u8(i8* %p) { +entry: +;CHECK: ld.global.u8 rq{{[0-9]+}}, [r{{[0-9]+}}]; +;CHECK-NEXT: ret; + %x = load i8* %p + ret i8 %x +} define ptx_device i16 @t1_u16(i16* %p) { entry: @@ -101,6 +120,15 @@ entry: ret double %x } +define ptx_device i8 @t2_u8(i8* %p) { +entry: +;CHECK: ld.global.u8 rq{{[0-9]+}}, [r{{[0-9]+}}+1]; +;CHECK-NEXT: ret; + %i = getelementptr i8* %p, i32 1 + %x = load i8* %i + ret i8 %x +} + define ptx_device i16 @t2_u16(i16* %p) { entry: ;CHECK: ld.global.u16 
rh{{[0-9]+}}, [r{{[0-9]+}}+2]; @@ -146,6 +174,15 @@ entry: ret double %x } +define ptx_device i8 @t3_u8(i8* %p, i32 %q) { +entry: +;CHECK: add.u32 r[[R0:[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}}; +;CHECK-NEXT: ld.global.u8 rq{{[0-9]+}}, [r[[R0]]]; + %i = getelementptr i8* %p, i32 %q + %x = load i8* %i + ret i8 %x +} + define ptx_device i16 @t3_u16(i16* %p, i32 %q) { entry: ;CHECK: shl.b32 r[[R0:[0-9]+]], r{{[0-9]+}}, 1; @@ -196,6 +233,16 @@ entry: ret double %x } +define ptx_device i8 @t4_global_u8() { +entry: +;CHECK: mov.u32 r[[R0:[0-9]+]], array_i8; +;CHECK-NEXT: ld.global.u8 rq{{[0-9]+}}, [r[[R0]]]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i8]* @array_i8, i32 0, i32 0 + %x = load i8* %i + ret i8 %x +} + define ptx_device i16 @t4_global_u16() { entry: ;CHECK: mov.u32 r[[R0:[0-9]+]], array_i16; @@ -296,6 +343,16 @@ entry: ret double %x } +define ptx_device i8 @t4_local_u8() { +entry: +;CHECK: mov.u32 r[[R0:[0-9]+]], array_local_i8; +;CHECK-NEXT: ld.local.u8 rq{{[0-9]+}}, [r[[R0]]]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i8] addrspace(2)* @array_local_i8, i32 0, i32 0 + %x = load i8 addrspace(2)* %i + ret i8 %x +} + define ptx_device i16 @t4_local_u16() { entry: ;CHECK: mov.u32 r[[R0:[0-9]+]], array_local_i16; @@ -346,6 +403,16 @@ entry: ret double %x } +define ptx_device i8 @t4_shared_u8() { +entry: +;CHECK: mov.u32 r[[R0:[0-9]+]], array_shared_i8; +;CHECK-NEXT: ld.shared.u8 rq{{[0-9]+}}, [r[[R0]]]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i8] addrspace(4)* @array_shared_i8, i32 0, i32 0 + %x = load i8 addrspace(4)* %i + ret i8 %x +} + define ptx_device i16 @t4_shared_u16() { entry: ;CHECK: mov.u32 r[[R0:[0-9]+]], array_shared_i16; @@ -396,6 +463,16 @@ entry: ret double %x } +define ptx_device i8 @t5_u8() { +entry: +;CHECK: mov.u32 r[[R0:[0-9]+]], array_i8; +;CHECK-NEXT: ld.global.u8 rq{{[0-9]+}}, [r[[R0]]+1]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i8]* @array_i8, i32 0, i32 1 + %x = load i8* %i + ret i8 %x +} + define ptx_device i16 @t5_u16() { 
entry: ;CHECK: mov.u32 r[[R0:[0-9]+]], array_i16; diff --git a/test/CodeGen/PTX/mov.ll b/test/CodeGen/PTX/mov.ll index cce6a5b8976..b930b4caefb 100644 --- a/test/CodeGen/PTX/mov.ll +++ b/test/CodeGen/PTX/mov.ll @@ -1,5 +1,11 @@ ; RUN: llc < %s -march=ptx32 | FileCheck %s +define ptx_device i8 @t1_u8() { +; CHECK: mov.u8 rq{{[0-9]+}}, 0; +; CHECK: ret; + ret i8 0 +} + define ptx_device i16 @t1_u16() { ; CHECK: mov.u16 rh{{[0-9]+}}, 0; ; CHECK: ret; @@ -30,6 +36,12 @@ define ptx_device double @t1_f64() { ret double 0.0 } +define ptx_device i8 @t2_u8(i8 %x) { +; CHECK: mov.u8 rq{{[0-9]+}}, rq{{[0-9]+}}; +; CHECK: ret; + ret i8 %x +} + define ptx_device i16 @t2_u16(i16 %x) { ; CHECK: mov.u16 rh{{[0-9]+}}, rh{{[0-9]+}}; ; CHECK: ret; diff --git a/test/CodeGen/PTX/st.ll b/test/CodeGen/PTX/st.ll index b08528e1c3c..596d189e4b7 100644 --- a/test/CodeGen/PTX/st.ll +++ b/test/CodeGen/PTX/st.ll @@ -1,5 +1,17 @@ ; RUN: llc < %s -march=ptx32 | FileCheck %s +;CHECK: .extern .global .b8 array_i8[10]; +@array_i8 = external global [10 x i8] + +;CHECK: .extern .const .b8 array_constant_i8[10]; +@array_constant_i8 = external addrspace(1) constant [10 x i8] + +;CHECK: .extern .local .b8 array_local_i8[10]; +@array_local_i8 = external addrspace(2) global [10 x i8] + +;CHECK: .extern .shared .b8 array_shared_i8[10]; +@array_shared_i8 = external addrspace(4) global [10 x i8] + ;CHECK: .extern .global .b8 array_i16[20]; @array_i16 = external global [10 x i16] @@ -60,6 +72,13 @@ ;CHECK: .extern .shared .b8 array_shared_double[80]; @array_shared_double = external addrspace(4) global [10 x double] +define ptx_device void @t1_u8(i8* %p, i8 %x) { +entry: +;CHECK: st.global.u8 [r{{[0-9]+}}], rq{{[0-9]+}}; +;CHECK-NEXT: ret; + store i8 %x, i8* %p + ret void +} define ptx_device void @t1_u16(i16* %p, i16 %x) { entry: @@ -101,6 +120,15 @@ entry: ret void } +define ptx_device void @t2_u8(i8* %p, i8 %x) { +entry: +;CHECK: st.global.u8 [r{{[0-9]+}}+1], rq{{[0-9]+}}; +;CHECK-NEXT: ret; + %i = 
getelementptr i8* %p, i32 1 + store i8 %x, i8* %i + ret void +} + define ptx_device void @t2_u16(i16* %p, i16 %x) { entry: ;CHECK: st.global.u16 [r{{[0-9]+}}+2], rh{{[0-9]+}}; @@ -146,6 +174,16 @@ entry: ret void } +define ptx_device void @t3_u8(i8* %p, i32 %q, i8 %x) { +entry: +;CHECK: add.u32 r[[R0:[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}}; +;CHECK-NEXT: st.global.u8 [r[[R0]]], rq{{[0-9]+}}; +;CHECK-NEXT: ret; + %i = getelementptr i8* %p, i32 %q + store i8 %x, i8* %i + ret void +} + define ptx_device void @t3_u16(i16* %p, i32 %q, i16 %x) { entry: ;CHECK: shl.b32 r[[R0:[0-9]+]], r{{[0-9]+}}, 1; @@ -201,6 +239,16 @@ entry: ret void } +define ptx_device void @t4_global_u8(i8 %x) { +entry: +;CHECK: mov.u32 r[[R0:[0-9]+]], array_i8; +;CHECK-NEXT: st.global.u8 [r[[R0]]], rq{{[0-9]+}}; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i8]* @array_i8, i8 0, i8 0 + store i8 %x, i8* %i + ret void +} + define ptx_device void @t4_global_u16(i16 %x) { entry: ;CHECK: mov.u32 r[[R0:[0-9]+]], array_i16; @@ -251,6 +299,16 @@ entry: ret void } +define ptx_device void @t4_local_u8(i8 %x) { +entry: +;CHECK: mov.u32 r[[R0:[0-9]+]], array_local_i8; +;CHECK-NEXT: st.local.u8 [r[[R0]]], rq{{[0-9]+}}; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i8] addrspace(2)* @array_local_i8, i32 0, i32 0 + store i8 %x, i8 addrspace(2)* %i + ret void +} + define ptx_device void @t4_local_u16(i16 %x) { entry: ;CHECK: mov.u32 r[[R0:[0-9]+]], array_local_i16; @@ -301,6 +359,16 @@ entry: ret void } +define ptx_device void @t4_shared_u8(i8 %x) { +entry: +;CHECK: mov.u32 r[[R0:[0-9]+]], array_shared_i8; +;CHECK-NEXT: st.shared.u8 [r[[R0]]], rq{{[0-9]+}}; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i8] addrspace(4)* @array_shared_i8, i32 0, i32 0 + store i8 %x, i8 addrspace(4)* %i + ret void +} + define ptx_device void @t4_shared_u16(i16 %x) { entry: ;CHECK: mov.u32 r[[R0:[0-9]+]], array_shared_i16; @@ -351,6 +419,16 @@ entry: ret void } +define ptx_device void @t5_u8(i8 %x) { +entry: +;CHECK: mov.u32 
r[[R0:[0-9]+]], array_i8; +;CHECK-NEXT: st.global.u8 [r[[R0]]+1], rq{{[0-9]+}}; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i8]* @array_i8, i32 0, i32 1 + store i8 %x, i8* %i + ret void +} + define ptx_device void @t5_u16(i16 %x) { entry: ;CHECK: mov.u32 r[[R0:[0-9]+]], array_i16;