From fd8978b021dbb0b9b09084dcc707c2054ff76280 Mon Sep 17 00:00:00 2001 From: Che-Liang Chiou Date: Wed, 2 Mar 2011 03:20:28 +0000 Subject: [PATCH] Extend initial support for primitive types in PTX backend - Allow i16, i32, i64, float, and double types, using the native .u16, .u32, .u64, .f32, and .f64 PTX types. - Allow loading/storing of all primitive types. - Allow primitive types to be passed as parameters. - Allow selection of PTX Version and Shader Model as sub-target attributes. - Merge integer/floating-point test cases for load/store. - Use .u32 instead of .s32 to conform to output from NVidia nvcc compiler. Patch by Justin Holewinski git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@126824 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PTX/PTX.td | 30 ++- lib/Target/PTX/PTXAsmPrinter.cpp | 66 +++-- lib/Target/PTX/PTXISelDAGToDAG.cpp | 27 +- lib/Target/PTX/PTXISelLowering.cpp | 31 ++- lib/Target/PTX/PTXInstrInfo.cpp | 17 +- lib/Target/PTX/PTXInstrInfo.td | 197 ++++++++------ lib/Target/PTX/PTXMFInfoExtract.cpp | 4 +- lib/Target/PTX/PTXRegisterInfo.td | 131 ++++++++- lib/Target/PTX/PTXSubtarget.cpp | 27 +- lib/Target/PTX/PTXSubtarget.h | 27 +- test/CodeGen/PTX/add.ll | 62 ++++- test/CodeGen/PTX/ld.ll | 404 +++++++++++++++++++++++++--- test/CodeGen/PTX/ld_float.ll | 86 ------ test/CodeGen/PTX/mov.ll | 55 +++- test/CodeGen/PTX/mul.ll | 18 +- test/CodeGen/PTX/options.ll | 7 +- test/CodeGen/PTX/st.ll | 367 +++++++++++++++++++++++-- test/CodeGen/PTX/st_float.ll | 78 ------ test/CodeGen/PTX/sub.ll | 62 ++++- 19 files changed, 1322 insertions(+), 374 deletions(-) delete mode 100644 test/CodeGen/PTX/ld_float.ll delete mode 100644 test/CodeGen/PTX/st_float.ll diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td index 8b1a1b18da5..9f62aa16f82 100644 --- a/lib/Target/PTX/PTX.td +++ b/lib/Target/PTX/PTX.td @@ -19,8 +19,34 @@ include "llvm/Target/Target.td" // Subtarget Features. 
//===----------------------------------------------------------------------===// -def FeatureSM20 : SubtargetFeature<"sm20", "is_sm20", "true", - "Enable sm_20 target architecture">; +//===- Architectural Features ---------------------------------------------===// + +def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true", + "Do not demote .f64 to .f32">; + +//===- PTX Version --------------------------------------------------------===// + +def FeaturePTX14 : SubtargetFeature<"ptx14", "PTXVersion", "PTX_VERSION_1_4", + "Use PTX Language Version 1.4">; + +def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0", + "Use PTX Language Version 2.0", + [FeaturePTX14]>; + +def FeaturePTX21 : SubtargetFeature<"ptx21", "PTXVersion", "PTX_VERSION_2_1", + "Use PTX Language Version 2.1", + [FeaturePTX20]>; + +//===- PTX Shader Model ---------------------------------------------------===// + +def FeatureSM10 : SubtargetFeature<"sm10", "PTXShaderModel", "PTX_SM_1_0", + "Enable Shader Model 1.0 compliance">; +def FeatureSM13 : SubtargetFeature<"sm13", "PTXShaderModel", "PTX_SM_1_3", + "Enable Shader Model 1.3 compliance", + [FeatureSM10, FeatureDouble]>; +def FeatureSM20 : SubtargetFeature<"sm20", "PTXShaderModel", "PTX_SM_2_0", + "Enable Shader Model 2.0 compliance", + [FeatureSM13]>; //===----------------------------------------------------------------------===// // PTX supported processors. 
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp index 25f26fa4c41..35eeadce2d2 100644 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ b/lib/Target/PTX/PTXAsmPrinter.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Target/Mangler.h" @@ -37,13 +38,6 @@ using namespace llvm; -static cl::opt -OptPTXVersion("ptx-version", cl::desc("Set PTX version"), cl::init("1.4")); - -static cl::opt -OptPTXTarget("ptx-target", cl::desc("Set GPU target (comma-separated list)"), - cl::init("sm_10")); - namespace { class PTXAsmPrinter : public AsmPrinter { public: @@ -82,11 +76,14 @@ private: static const char PARAM_PREFIX[] = "__param_"; static const char *getRegisterTypeName(unsigned RegNo) { -#define TEST_REGCLS(cls, clsstr) \ +#define TEST_REGCLS(cls, clsstr) \ if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr; - TEST_REGCLS(RRegf32, f32); - TEST_REGCLS(RRegs32, s32); TEST_REGCLS(Preds, pred); + TEST_REGCLS(RRegu16, u16); + TEST_REGCLS(RRegu32, u32); + TEST_REGCLS(RRegu64, u64); + TEST_REGCLS(RRegf32, f32); + TEST_REGCLS(RRegf64, f64); #undef TEST_REGCLS llvm_unreachable("Not in any register class!"); @@ -121,7 +118,14 @@ static const char *getTypeName(const Type* type) { switch (type->getTypeID()) { default: llvm_unreachable("Unknown type"); case Type::FloatTyID: return ".f32"; - case Type::IntegerTyID: return ".s32"; // TODO: Handle 64-bit types. 
+ case Type::DoubleTyID: return ".f64"; + case Type::IntegerTyID: + switch (type->getPrimitiveSizeInBits()) { + default: llvm_unreachable("Unknown integer bit-width"); + case 16: return ".u16"; + case 32: return ".u32"; + case 64: return ".u64"; + } case Type::ArrayTyID: case Type::PointerTyID: type = dyn_cast(type)->getElementType(); @@ -162,8 +166,11 @@ bool PTXAsmPrinter::doFinalization(Module &M) { void PTXAsmPrinter::EmitStartOfAsmFile(Module &M) { - OutStreamer.EmitRawText(Twine("\t.version " + OptPTXVersion)); - OutStreamer.EmitRawText(Twine("\t.target " + OptPTXTarget)); + const PTXSubtarget& ST = TM.getSubtarget(); + + OutStreamer.EmitRawText(Twine("\t.version " + ST.getPTXVersionString())); + OutStreamer.EmitRawText(Twine("\t.target " + ST.getTargetString() + + (ST.supportsDouble() ? "" : ", map_f64_to_f32"))); OutStreamer.AddBlankLine(); // declare global variables @@ -236,11 +243,24 @@ void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, break; case MachineOperand::MO_FPImmediate: APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt(); - if (constFP.getZExtValue() > 0) { - OS << "0F" << constFP.toString(16, false); + bool isFloat = MO.getFPImm()->getType()->getTypeID() == Type::FloatTyID; + // Emit 0F for 32-bit floats and 0D for 64-bit doubles. + if (isFloat) { + OS << "0F"; } else { - OS << "0F00000000"; + OS << "0D"; + } + // Emit the encoded floating-point value. + if (constFP.getZExtValue() > 0) { + OS << constFP.toString(16, false); + } + else { + OS << "00000000"; + // If we have a double-precision zero, pad to 8 bytes. 
+ if (!isFloat) { + OS << "00000000"; + } } break; } @@ -338,12 +358,18 @@ void PTXAsmPrinter::EmitFunctionDeclaration() { if (!MFI->argRegEmpty()) { decl += " ("; if (isKernel) { - for (int i = 0, e = MFI->getNumArg(); i != e; ++i) { - if (i != 0) + unsigned cnt = 0; + //for (int i = 0, e = MFI->getNumArg(); i != e; ++i) { + for(PTXMachineFunctionInfo::reg_iterator + i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; i != e; ++i) { + reg = *i; + assert(reg != PTX::NoRegister && "Not a valid register!"); + if (i != b) decl += ", "; - decl += ".param .s32 "; // TODO: add types + decl += ".param .u32"; // TODO: Parse type from register map + decl += " "; decl += PARAM_PREFIX; - decl += utostr(i + 1); + decl += utostr(++cnt); } } else { for (PTXMachineFunctionInfo::reg_iterator diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp index efb0e8b1af7..1e6a53fee91 100644 --- a/lib/Target/PTX/PTXISelDAGToDAG.cpp +++ b/lib/Target/PTX/PTXISelDAGToDAG.cpp @@ -15,6 +15,7 @@ #include "PTXTargetMachine.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/DerivedTypes.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -66,14 +67,34 @@ SDNode *PTXDAGToDAGISel::Select(SDNode *Node) { } SDNode *PTXDAGToDAGISel::SelectREAD_PARAM(SDNode *Node) { - SDValue index = Node->getOperand(1); - DebugLoc dl = Node->getDebugLoc(); + SDValue index = Node->getOperand(1); + DebugLoc dl = Node->getDebugLoc(); + unsigned opcode; if (index.getOpcode() != ISD::TargetConstant) llvm_unreachable("READ_PARAM: index is not ISD::TargetConstant"); + if (Node->getValueType(0) == MVT::i16) { + opcode = PTX::LDpiU16; + } + else if (Node->getValueType(0) == MVT::i32) { + opcode = PTX::LDpiU32; + } + else if (Node->getValueType(0) == MVT::i64) { + opcode = PTX::LDpiU64; + } + else if (Node->getValueType(0) == MVT::f32) { + opcode = PTX::LDpiF32; + } + else if (Node->getValueType(0) == MVT::f64) { + opcode = PTX::LDpiF64; + } + else { + 
llvm_unreachable("Unknown parameter type for ld.param"); + } + return PTXInstrInfo:: - GetPTXMachineNode(CurDAG, PTX::LDpi, dl, MVT::i32, index); + GetPTXMachineNode(CurDAG, opcode, dl, Node->getValueType(0), index); } // Match memory operand of the form [reg+reg] diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index d30c9ecbe49..147b2a82cfc 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -27,13 +28,17 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()) { // Set up the register classes. addRegisterClass(MVT::i1, PTX::PredsRegisterClass); - addRegisterClass(MVT::i32, PTX::RRegs32RegisterClass); + addRegisterClass(MVT::i16, PTX::RRegu16RegisterClass); + addRegisterClass(MVT::i32, PTX::RRegu32RegisterClass); + addRegisterClass(MVT::i64, PTX::RRegu64RegisterClass); addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass); - + addRegisterClass(MVT::f64, PTX::RRegf64RegisterClass); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + // Customize translation of memory addresses setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); @@ -90,10 +95,13 @@ struct argmap_entry { bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; } } argmap[] = { argmap_entry(MVT::i1, PTX::PredsRegisterClass), - argmap_entry(MVT::i32, PTX::RRegs32RegisterClass), - argmap_entry(MVT::f32, PTX::RRegf32RegisterClass) + argmap_entry(MVT::i16, PTX::RRegu16RegisterClass), + argmap_entry(MVT::i32, PTX::RRegu32RegisterClass), + argmap_entry(MVT::i64, PTX::RRegu64RegisterClass), + argmap_entry(MVT::f32, 
PTX::RRegf32RegisterClass), + argmap_entry(MVT::f64, PTX::RRegf64RegisterClass) }; -} // end anonymous namespace +} // end anonymous namespace SDValue PTXTargetLowering:: LowerFormalArguments(SDValue Chain, @@ -192,12 +200,21 @@ SDValue PTXTargetLowering:: SDValue Flag; unsigned reg; - if (Outs[0].VT == MVT::i32) { + if (Outs[0].VT == MVT::i16) { + reg = PTX::RH0; + } + else if (Outs[0].VT == MVT::i32) { reg = PTX::R0; } + else if (Outs[0].VT == MVT::i64) { + reg = PTX::RD0; + } else if (Outs[0].VT == MVT::f32) { reg = PTX::F0; } + else if (Outs[0].VT == MVT::f64) { + reg = PTX::FD0; + } else { assert(false && "Can return only basic types"); } diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp index f2e5e4c1102..7277238c2ec 100644 --- a/lib/Target/PTX/PTXInstrInfo.cpp +++ b/lib/Target/PTX/PTXInstrInfo.cpp @@ -27,9 +27,12 @@ static const struct map_entry { const TargetRegisterClass *cls; const int opcode; } map[] = { - { &PTX::RRegs32RegClass, PTX::MOVrr }, - { &PTX::RRegf32RegClass, PTX::MOVrr }, - { &PTX::PredsRegClass, PTX::MOVpp } + { &PTX::RRegu16RegClass, PTX::MOVU16rr }, + { &PTX::RRegu32RegClass, PTX::MOVU32rr }, + { &PTX::RRegu64RegClass, PTX::MOVU64rr }, + { &PTX::RRegf32RegClass, PTX::MOVF32rr }, + { &PTX::RRegf64RegClass, PTX::MOVF64rr }, + { &PTX::PredsRegClass, PTX::MOVPREDrr } }; void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -76,8 +79,12 @@ bool PTXInstrInfo::isMoveInstr(const MachineInstr& MI, switch (MI.getOpcode()) { default: return false; - case PTX::MOVpp: - case PTX::MOVrr: + case PTX::MOVU16rr: + case PTX::MOVU32rr: + case PTX::MOVU64rr: + case PTX::MOVF32rr: + case PTX::MOVF64rr: + case PTX::MOVPREDrr: assert(MI.getNumOperands() >= 2 && MI.getOperand(0).isReg() && MI.getOperand(1).isReg() && "Invalid register-register move instruction"); diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index 9d962b0e252..fce6da66eff 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ 
b/lib/Target/PTX/PTXInstrInfo.td @@ -114,7 +114,7 @@ def ADDRii : ComplexPattern; // Address operands def MEMri : Operand { let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops RRegs32, i32imm); + let MIOperandInfo = (ops RRegu32, i32imm); } def MEMii : Operand { let PrintMethod = "printMemOperand"; @@ -143,75 +143,115 @@ def PTXret // Instruction Class Templates //===----------------------------------------------------------------------===// -// Three-operand f32 instruction template +// Three-operand floating-point instruction template multiclass FLOAT3 { - def rr : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a, RRegf32:$b), - !strconcat(opcstr, ".%type\t$d, $a, $b"), - [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>; - def ri : InstPTX<(outs RRegf32:$d), - (ins RRegf32:$a, f32imm:$b), - !strconcat(opcstr, ".%type\t$d, $a, $b"), - [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>; + def rr32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, RRegf32:$b), + !strconcat(opcstr, ".f32\t$d, $a, $b"), + [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>; + def ri32 : InstPTX<(outs RRegf32:$d), + (ins RRegf32:$a, f32imm:$b), + !strconcat(opcstr, ".f32\t$d, $a, $b"), + [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>; + def rr64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, RRegf64:$b), + !strconcat(opcstr, ".f64\t$d, $a, $b"), + [(set RRegf64:$d, (opnode RRegf64:$a, RRegf64:$b))]>; + def ri64 : InstPTX<(outs RRegf64:$d), + (ins RRegf64:$a, f64imm:$b), + !strconcat(opcstr, ".f64\t$d, $a, $b"), + [(set RRegf64:$d, (opnode RRegf64:$a, fpimm:$b))]>; } multiclass INT3 { - def rr : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, RRegs32:$b), - !strconcat(opcstr, ".%type\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, RRegs32:$b))]>; - def ri : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, i32imm:$b), - !strconcat(opcstr, ".%type\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, imm:$b))]>; + def rr16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, 
RRegu16:$b), + !strconcat(opcstr, ".u16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>; + def ri16 : InstPTX<(outs RRegu16:$d), + (ins RRegu16:$a, i16imm:$b), + !strconcat(opcstr, ".u16\t$d, $a, $b"), + [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>; + def rr32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, RRegu32:$b), + !strconcat(opcstr, ".u32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; + def ri32 : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, i32imm:$b), + !strconcat(opcstr, ".u32\t$d, $a, $b"), + [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; + def rr64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, RRegu64:$b), + !strconcat(opcstr, ".u64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>; + def ri64 : InstPTX<(outs RRegu64:$d), + (ins RRegu64:$a, i64imm:$b), + !strconcat(opcstr, ".u64\t$d, $a, $b"), + [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>; } // no %type directive, non-communtable multiclass INT3ntnc { - def rr : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, RRegs32:$b), + def rr : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, RRegu32:$b), !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, RRegs32:$b))]>; - def ri : InstPTX<(outs RRegs32:$d), - (ins RRegs32:$a, i32imm:$b), + [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>; + def ri : InstPTX<(outs RRegu32:$d), + (ins RRegu32:$a, i32imm:$b), !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode RRegs32:$a, imm:$b))]>; - def ir : InstPTX<(outs RRegs32:$d), - (ins i32imm:$a, RRegs32:$b), + [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>; + def ir : InstPTX<(outs RRegu32:$d), + (ins i32imm:$a, RRegu32:$b), !strconcat(opcstr, "\t$d, $a, $b"), - [(set RRegs32:$d, (opnode imm:$a, RRegs32:$b))]>; + [(set RRegu32:$d, (opnode imm:$a, RRegu32:$b))]>; } -multiclass PTX_LD { +multiclass PTX_LD { def rr : InstPTX<(outs RC:$d), (ins MEMri:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), + 
!strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), [(set RC:$d, (pat_load ADDRrr:$a))]>; def ri : InstPTX<(outs RC:$d), (ins MEMri:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), [(set RC:$d, (pat_load ADDRri:$a))]>; def ii : InstPTX<(outs RC:$d), (ins MEMii:$a), - !strconcat(opstr, ".%type\t$d, [$a]"), + !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")), [(set RC:$d, (pat_load ADDRii:$a))]>; } -multiclass PTX_ST { +multiclass PTX_LD_ALL { + defm u16 : PTX_LD; + defm u32 : PTX_LD; + defm u64 : PTX_LD; + defm f32 : PTX_LD; + defm f64 : PTX_LD; +} + +multiclass PTX_ST { def rr : InstPTX<(outs), (ins RC:$d, MEMri:$a), - !strconcat(opstr, ".%type\t[$a], $d"), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), [(pat_store RC:$d, ADDRrr:$a)]>; def ri : InstPTX<(outs), (ins RC:$d, MEMri:$a), - !strconcat(opstr, ".%type\t[$a], $d"), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), [(pat_store RC:$d, ADDRri:$a)]>; def ii : InstPTX<(outs), (ins RC:$d, MEMii:$a), - !strconcat(opstr, ".%type\t[$a], $d"), + !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")), [(pat_store RC:$d, ADDRii:$a)]>; } +multiclass PTX_ST_ALL { + defm u16 : PTX_ST; + defm u32 : PTX_ST; + defm u64 : PTX_ST; + defm f32 : PTX_ST; + defm f64 : PTX_ST; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -236,60 +276,67 @@ defm SRA : INT3ntnc<"shr.s32", PTXsra>; ///===- Data Movement and Conversion Instructions -------------------------===// let neverHasSideEffects = 1 in { - // rely on isMoveInstr to separate MOVpp, MOVrr, etc. 
- def MOVpp + def MOVPREDrr : InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>; - def MOVrr - : InstPTX<(outs RRegs32:$d), (ins RRegs32:$a), "mov.%type\t$d, $a", []>; - def FMOVrr + def MOVU16rr + : InstPTX<(outs RRegu16:$d), (ins RRegu16:$a), "mov.u16\t$d, $a", []>; + def MOVU32rr + : InstPTX<(outs RRegu32:$d), (ins RRegu32:$a), "mov.u32\t$d, $a", []>; + def MOVU64rr + : InstPTX<(outs RRegu64:$d), (ins RRegu64:$a), "mov.u64\t$d, $a", []>; + def MOVF32rr : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>; + def MOVF64rr + : InstPTX<(outs RRegf64:$d), (ins RRegf64:$a), "mov.f64\t$d, $a", []>; } let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - def MOVpi + def MOVPREDri : InstPTX<(outs Preds:$d), (ins i1imm:$a), "mov.pred\t$d, $a", [(set Preds:$d, imm:$a)]>; - def MOVri - : InstPTX<(outs RRegs32:$d), (ins i32imm:$a), "mov.s32\t$d, $a", - [(set RRegs32:$d, imm:$a)]>; - def FMOVri + def MOVU16ri + : InstPTX<(outs RRegu16:$d), (ins i16imm:$a), "mov.u16\t$d, $a", + [(set RRegu16:$d, imm:$a)]>; + def MOVU32ri + : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a", + [(set RRegu32:$d, imm:$a)]>; + def MOVU164ri + : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a", + [(set RRegu64:$d, imm:$a)]>; + def MOVF32ri : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), "mov.f32\t$d, $a", [(set RRegf32:$d, fpimm:$a)]>; + def MOVF64ri + : InstPTX<(outs RRegf64:$d), (ins f64imm:$a), "mov.f64\t$d, $a", + [(set RRegf64:$d, fpimm:$a)]>; } -// Integer loads -defm LDg : PTX_LD<"ld.global", RRegs32, load_global>; -defm LDc : PTX_LD<"ld.const", RRegs32, load_constant>; -defm LDl : PTX_LD<"ld.local", RRegs32, load_local>; -defm LDp : PTX_LD<"ld.param", RRegs32, load_parameter>; -defm LDs : PTX_LD<"ld.shared", RRegs32, load_shared>; +// Loads +defm LDg : PTX_LD_ALL<"ld.global", load_global>; +defm LDc : PTX_LD_ALL<"ld.const", load_constant>; +defm LDl : PTX_LD_ALL<"ld.local", load_local>; +defm LDs : PTX_LD_ALL<"ld.shared", 
load_shared>; -def LDpi : InstPTX<(outs RRegs32:$d), (ins MEMpi:$a), - "ld.param.%type\t$d, [$a]", []>; +// This is a special instruction that is manually inserted for kernel parameters +def LDpiU16 : InstPTX<(outs RRegu16:$d), (ins MEMpi:$a), + "ld.param.u16\t$d, [$a]", []>; +def LDpiU32 : InstPTX<(outs RRegu32:$d), (ins MEMpi:$a), + "ld.param.u32\t$d, [$a]", []>; +def LDpiU64 : InstPTX<(outs RRegu64:$d), (ins MEMpi:$a), + "ld.param.u64\t$d, [$a]", []>; +def LDpiF32 : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a), + "ld.param.f32\t$d, [$a]", []>; +def LDpiF64 : InstPTX<(outs RRegf64:$d), (ins MEMpi:$a), + "ld.param.f64\t$d, [$a]", []>; -// Floating-point loads -defm FLDg : PTX_LD<"ld.global", RRegf32, load_global>; -defm FLDc : PTX_LD<"ld.const", RRegf32, load_constant>; -defm FLDl : PTX_LD<"ld.local", RRegf32, load_local>; -defm FLDp : PTX_LD<"ld.param", RRegf32, load_parameter>; -defm FLDs : PTX_LD<"ld.shared", RRegf32, load_shared>; +// Stores +defm STg : PTX_ST_ALL<"st.global", store_global>; +defm STl : PTX_ST_ALL<"st.local", store_local>; +defm STs : PTX_ST_ALL<"st.shared", store_shared>; -def FLDpi : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a), - "ld.param.%type\t$d, [$a]", []>; - -// Integer stores -defm STg : PTX_ST<"st.global", RRegs32, store_global>; -defm STl : PTX_ST<"st.local", RRegs32, store_local>; -// Store to parameter state space requires PTX 2.0 or higher? -// defm STp : PTX_ST<"st.param", RRegs32, store_parameter>; -defm STs : PTX_ST<"st.shared", RRegs32, store_shared>; - -// Floating-point stores -defm FSTg : PTX_ST<"st.global", RRegf32, store_global>; -defm FSTl : PTX_ST<"st.local", RRegf32, store_local>; -// Store to parameter state space requires PTX 2.0 or higher? 
-// defm FSTp : PTX_ST<"st.param", RRegf32, store_parameter>; -defm FSTs : PTX_ST<"st.shared", RRegf32, store_shared>; +// defm STp : PTX_ST_ALL<"st.param", store_parameter>; +// defm LDp : PTX_LD_ALL<"ld.param", load_parameter>; +// TODO: Do something with st.param if/when it is needed. ///===- Control Flow Instructions -----------------------------------------===// diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp index b37c740006f..c5e19100723 100644 --- a/lib/Target/PTX/PTXMFInfoExtract.cpp +++ b/lib/Target/PTX/PTXMFInfoExtract.cpp @@ -79,12 +79,12 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) { DEBUG(for (PTXMachineFunctionInfo::reg_iterator i = MFI->argRegBegin(), e = MFI->argRegEnd(); - i != e; ++i) + i != e; ++i) dbgs() << "Arg Reg: " << *i << "\n";); DEBUG(for (PTXMachineFunctionInfo::reg_iterator i = MFI->localVarRegBegin(), e = MFI->localVarRegEnd(); - i != e; ++i) + i != e; ++i) dbgs() << "Local Var Reg: " << *i << "\n";); return false; diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td index 9158f0d31c7..548e3bbeb98 100644 --- a/lib/Target/PTX/PTXRegisterInfo.td +++ b/lib/Target/PTX/PTXRegisterInfo.td @@ -19,6 +19,8 @@ class PTXReg : Register { // Registers //===----------------------------------------------------------------------===// +///===- Predicate Registers -----------------------------------------------===// + def P0 : PTXReg<"p0">; def P1 : PTXReg<"p1">; def P2 : PTXReg<"p2">; @@ -52,6 +54,43 @@ def P29 : PTXReg<"p29">; def P30 : PTXReg<"p30">; def P31 : PTXReg<"p31">; +///===- 16-bit Integer Registers ------------------------------------------===// + +def RH0 : PTXReg<"rh0">; +def RH1 : PTXReg<"rh1">; +def RH2 : PTXReg<"rh2">; +def RH3 : PTXReg<"rh3">; +def RH4 : PTXReg<"rh4">; +def RH5 : PTXReg<"rh5">; +def RH6 : PTXReg<"rh6">; +def RH7 : PTXReg<"rh7">; +def RH8 : PTXReg<"rh8">; +def RH9 : PTXReg<"rh9">; +def RH10 : PTXReg<"rh10">; +def RH11 : 
PTXReg<"rh11">; +def RH12 : PTXReg<"rh12">; +def RH13 : PTXReg<"rh13">; +def RH14 : PTXReg<"rh14">; +def RH15 : PTXReg<"rh15">; +def RH16 : PTXReg<"rh16">; +def RH17 : PTXReg<"rh17">; +def RH18 : PTXReg<"rh18">; +def RH19 : PTXReg<"rh19">; +def RH20 : PTXReg<"rh20">; +def RH21 : PTXReg<"rh21">; +def RH22 : PTXReg<"rh22">; +def RH23 : PTXReg<"rh23">; +def RH24 : PTXReg<"rh24">; +def RH25 : PTXReg<"rh25">; +def RH26 : PTXReg<"rh26">; +def RH27 : PTXReg<"rh27">; +def RH28 : PTXReg<"rh28">; +def RH29 : PTXReg<"rh29">; +def RH30 : PTXReg<"rh30">; +def RH31 : PTXReg<"rh31">; + +///===- 32-bit Integer Registers ------------------------------------------===// + def R0 : PTXReg<"r0">; def R1 : PTXReg<"r1">; def R2 : PTXReg<"r2">; @@ -85,6 +124,43 @@ def R29 : PTXReg<"r29">; def R30 : PTXReg<"r30">; def R31 : PTXReg<"r31">; +///===- 64-bit Integer Registers ------------------------------------------===// + +def RD0 : PTXReg<"rd0">; +def RD1 : PTXReg<"rd1">; +def RD2 : PTXReg<"rd2">; +def RD3 : PTXReg<"rd3">; +def RD4 : PTXReg<"rd4">; +def RD5 : PTXReg<"rd5">; +def RD6 : PTXReg<"rd6">; +def RD7 : PTXReg<"rd7">; +def RD8 : PTXReg<"rd8">; +def RD9 : PTXReg<"rd9">; +def RD10 : PTXReg<"rd10">; +def RD11 : PTXReg<"rd11">; +def RD12 : PTXReg<"rd12">; +def RD13 : PTXReg<"rd13">; +def RD14 : PTXReg<"rd14">; +def RD15 : PTXReg<"rd15">; +def RD16 : PTXReg<"rd16">; +def RD17 : PTXReg<"rd17">; +def RD18 : PTXReg<"rd18">; +def RD19 : PTXReg<"rd19">; +def RD20 : PTXReg<"rd20">; +def RD21 : PTXReg<"rd21">; +def RD22 : PTXReg<"rd22">; +def RD23 : PTXReg<"rd23">; +def RD24 : PTXReg<"rd24">; +def RD25 : PTXReg<"rd25">; +def RD26 : PTXReg<"rd26">; +def RD27 : PTXReg<"rd27">; +def RD28 : PTXReg<"rd28">; +def RD29 : PTXReg<"rd29">; +def RD30 : PTXReg<"rd30">; +def RD31 : PTXReg<"rd31">; + +///===- 32-bit Floating-Point Registers -----------------------------------===// + def F0 : PTXReg<"f0">; def F1 : PTXReg<"f1">; def F2 : PTXReg<"f2">; @@ -118,6 +194,41 @@ def F29 : PTXReg<"f29">; def F30 : 
PTXReg<"f30">; def F31 : PTXReg<"f31">; +///===- 64-bit Floating-Point Registers -----------------------------------===// + +def FD0 : PTXReg<"fd0">; +def FD1 : PTXReg<"fd1">; +def FD2 : PTXReg<"fd2">; +def FD3 : PTXReg<"fd3">; +def FD4 : PTXReg<"fd4">; +def FD5 : PTXReg<"fd5">; +def FD6 : PTXReg<"fd6">; +def FD7 : PTXReg<"fd7">; +def FD8 : PTXReg<"fd8">; +def FD9 : PTXReg<"fd9">; +def FD10 : PTXReg<"fd10">; +def FD11 : PTXReg<"fd11">; +def FD12 : PTXReg<"fd12">; +def FD13 : PTXReg<"fd13">; +def FD14 : PTXReg<"fd14">; +def FD15 : PTXReg<"fd15">; +def FD16 : PTXReg<"fd16">; +def FD17 : PTXReg<"fd17">; +def FD18 : PTXReg<"fd18">; +def FD19 : PTXReg<"fd19">; +def FD20 : PTXReg<"fd20">; +def FD21 : PTXReg<"fd21">; +def FD22 : PTXReg<"fd22">; +def FD23 : PTXReg<"fd23">; +def FD24 : PTXReg<"fd24">; +def FD25 : PTXReg<"fd25">; +def FD26 : PTXReg<"fd26">; +def FD27 : PTXReg<"fd27">; +def FD28 : PTXReg<"fd28">; +def FD29 : PTXReg<"fd29">; +def FD30 : PTXReg<"fd30">; +def FD31 : PTXReg<"fd31">; + //===----------------------------------------------------------------------===// // Register classes @@ -129,14 +240,32 @@ def Preds : RegisterClass<"PTX", [i1], 8, P16, P17, P18, P19, P20, P21, P22, P23, P24, P25, P26, P27, P28, P29, P30, P31]>; -def RRegs32 : RegisterClass<"PTX", [i32], 32, +def RRegu16 : RegisterClass<"PTX", [i16], 16, + [RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7, + RH8, RH9, RH10, RH11, RH12, RH13, RH14, RH15, + RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23, + RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31]>; + +def RRegu32 : RegisterClass<"PTX", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31]>; +def RRegu64 : RegisterClass<"PTX", [i64], 64, + [RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7, + RD8, RD9, RD10, RD11, RD12, RD13, RD14, RD15, + RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23, + RD24, RD25, RD26, RD27, RD28, RD29, RD30, RD31]>; + def RRegf32 : 
RegisterClass<"PTX", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, F31]>; + +def RRegf64 : RegisterClass<"PTX", [f64], 64, + [FD0, FD1, FD2, FD3, FD4, FD5, FD6, FD7, + FD8, FD9, FD10, FD11, FD12, FD13, FD14, FD15, + FD16, FD17, FD18, FD19, FD20, FD21, FD22, FD23, + FD24, FD25, FD26, FD27, FD28, FD29, FD30, FD31]>; diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp index 00e2c882a5c..18a93052c99 100644 --- a/lib/Target/PTX/PTXSubtarget.cpp +++ b/lib/Target/PTX/PTXSubtarget.cpp @@ -12,12 +12,33 @@ //===----------------------------------------------------------------------===// #include "PTXSubtarget.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS) { - std::string TARGET = "sm_20"; - // TODO: call ParseSubtargetFeatures(FS, TARGET); +PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS) + : PTXShaderModel(PTX_SM_1_0), + PTXVersion(PTX_VERSION_1_4) { + std::string TARGET = "generic"; + ParseSubtargetFeatures(FS, TARGET); +} + +std::string PTXSubtarget::getTargetString() const { + switch(PTXShaderModel) { + default: llvm_unreachable("Unknown shader model"); + case PTX_SM_1_0: return "sm_10"; + case PTX_SM_1_3: return "sm_13"; + case PTX_SM_2_0: return "sm_20"; + } +} + +std::string PTXSubtarget::getPTXVersionString() const { + switch(PTXVersion) { + default: llvm_unreachable("Unknown PTX version"); + case PTX_VERSION_1_4: return "1.4"; + case PTX_VERSION_2_0: return "2.0"; + case PTX_VERSION_2_1: return "2.1"; + } } #include "PTXGenSubtarget.inc" diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h index 7fd85f873ae..9a9ada2af6d 100644 --- a/lib/Target/PTX/PTXSubtarget.h +++ b/lib/Target/PTX/PTXSubtarget.h @@ -19,11 +19,36 @@ namespace llvm { class PTXSubtarget : public TargetSubtarget { private: - bool 
is_sm20; + enum PTXShaderModelEnum { + PTX_SM_1_0, + PTX_SM_1_3, + PTX_SM_2_0 + }; + + enum PTXVersionEnum { + PTX_VERSION_1_4, + PTX_VERSION_2_0, + PTX_VERSION_2_1 + }; + + /// Shader Model supported on the target GPU. + PTXShaderModelEnum PTXShaderModel; + + /// PTX Language Version. + PTXVersionEnum PTXVersion; + + // The native .f64 type is supported on the hardware. + bool SupportsDouble; public: PTXSubtarget(const std::string &TT, const std::string &FS); + std::string getTargetString() const; + + std::string getPTXVersionString() const; + + bool supportsDouble() const { return SupportsDouble; } + std::string ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); }; // class PTXSubtarget diff --git a/test/CodeGen/PTX/add.ll b/test/CodeGen/PTX/add.ll index 9e777ae30cb..598591c0fcb 100644 --- a/test/CodeGen/PTX/add.ll +++ b/test/CodeGen/PTX/add.ll @@ -1,29 +1,71 @@ ; RUN: llc < %s -march=ptx | FileCheck %s -define ptx_device i32 @t1(i32 %x, i32 %y) { -; CHECK: add.s32 r0, r1, r2; +define ptx_device i16 @t1_u16(i16 %x, i16 %y) { +; CHECK: add.u16 rh0, rh1, rh2; +; CHECK-NEXT: ret; + %z = add i16 %x, %y + ret i16 %z +} + +define ptx_device i32 @t1_u32(i32 %x, i32 %y) { +; CHECK: add.u32 r0, r1, r2; +; CHECK-NEXT: ret; %z = add i32 %x, %y -; CHECK: ret; ret i32 %z } -define ptx_device i32 @t2(i32 %x) { -; CHECK: add.s32 r0, r1, 1; - %z = add i32 %x, 1 -; CHECK: ret; - ret i32 %z +define ptx_device i64 @t1_u64(i64 %x, i64 %y) { +; CHECK: add.u64 rd0, rd1, rd2; +; CHECK-NEXT: ret; + %z = add i64 %x, %y + ret i64 %z } -define ptx_device float @t3(float %x, float %y) { +define ptx_device float @t1_f32(float %x, float %y) { ; CHECK: add.f32 f0, f1, f2 ; CHECK-NEXT: ret; %z = fadd float %x, %y ret float %z } -define ptx_device float @t4(float %x) { +define ptx_device double @t1_f64(double %x, double %y) { +; CHECK: add.f64 fd0, fd1, fd2 +; CHECK-NEXT: ret; + %z = fadd double %x, %y + ret double %z +} + +define ptx_device i16 @t2_u16(i16 %x) { +; CHECK: 
add.u16 rh0, rh1, 1; +; CHECK-NEXT: ret; + %z = add i16 %x, 1 + ret i16 %z +} + +define ptx_device i32 @t2_u32(i32 %x) { +; CHECK: add.u32 r0, r1, 1; +; CHECK-NEXT: ret; + %z = add i32 %x, 1 + ret i32 %z +} + +define ptx_device i64 @t2_u64(i64 %x) { +; CHECK: add.u64 rd0, rd1, 1; +; CHECK-NEXT: ret; + %z = add i64 %x, 1 + ret i64 %z +} + +define ptx_device float @t2_f32(float %x) { ; CHECK: add.f32 f0, f1, 0F3F800000; ; CHECK-NEXT: ret; %z = fadd float %x, 1.0 ret float %z } + +define ptx_device double @t2_f64(double %x) { +; CHECK: add.f64 fd0, fd1, 0D3FF0000000000000; +; CHECK-NEXT: ret; + %z = fadd double %x, 1.0 + ret double %z +} diff --git a/test/CodeGen/PTX/ld.ll b/test/CodeGen/PTX/ld.ll index 836c4d41045..e7cc92e3c99 100644 --- a/test/CodeGen/PTX/ld.ll +++ b/test/CodeGen/PTX/ld.ll @@ -1,78 +1,422 @@ ; RUN: llc < %s -march=ptx | FileCheck %s -;CHECK: .extern .global .s32 array[]; -@array = external global [10 x i32] +;CHECK: .extern .global .u16 array_i16[]; +@array_i16 = external global [10 x i16] -;CHECK: .extern .const .s32 array_constant[]; -@array_constant = external addrspace(1) constant [10 x i32] +;CHECK: .extern .const .u16 array_constant_i16[]; +@array_constant_i16 = external addrspace(1) constant [10 x i16] -;CHECK: .extern .local .s32 array_local[]; -@array_local = external addrspace(2) global [10 x i32] +;CHECK: .extern .local .u16 array_local_i16[]; +@array_local_i16 = external addrspace(2) global [10 x i16] -;CHECK: .extern .shared .s32 array_shared[]; -@array_shared = external addrspace(4) global [10 x i32] +;CHECK: .extern .shared .u16 array_shared_i16[]; +@array_shared_i16 = external addrspace(4) global [10 x i16] -define ptx_device i32 @t1(i32* %p) { +;CHECK: .extern .global .u32 array_i32[]; +@array_i32 = external global [10 x i32] + +;CHECK: .extern .const .u32 array_constant_i32[]; +@array_constant_i32 = external addrspace(1) constant [10 x i32] + +;CHECK: .extern .local .u32 array_local_i32[]; +@array_local_i32 = external addrspace(2) 
global [10 x i32] + +;CHECK: .extern .shared .u32 array_shared_i32[]; +@array_shared_i32 = external addrspace(4) global [10 x i32] + +;CHECK: .extern .global .u64 array_i64[]; +@array_i64 = external global [10 x i64] + +;CHECK: .extern .const .u64 array_constant_i64[]; +@array_constant_i64 = external addrspace(1) constant [10 x i64] + +;CHECK: .extern .local .u64 array_local_i64[]; +@array_local_i64 = external addrspace(2) global [10 x i64] + +;CHECK: .extern .shared .u64 array_shared_i64[]; +@array_shared_i64 = external addrspace(4) global [10 x i64] + +;CHECK: .extern .global .f32 array_float[]; +@array_float = external global [10 x float] + +;CHECK: .extern .const .f32 array_constant_float[]; +@array_constant_float = external addrspace(1) constant [10 x float] + +;CHECK: .extern .local .f32 array_local_float[]; +@array_local_float = external addrspace(2) global [10 x float] + +;CHECK: .extern .shared .f32 array_shared_float[]; +@array_shared_float = external addrspace(4) global [10 x float] + +;CHECK: .extern .global .f64 array_double[]; +@array_double = external global [10 x double] + +;CHECK: .extern .const .f64 array_constant_double[]; +@array_constant_double = external addrspace(1) constant [10 x double] + +;CHECK: .extern .local .f64 array_local_double[]; +@array_local_double = external addrspace(2) global [10 x double] + +;CHECK: .extern .shared .f64 array_shared_double[]; +@array_shared_double = external addrspace(4) global [10 x double] + + +define ptx_device i16 @t1_u16(i16* %p) { entry: -;CHECK: ld.global.s32 r0, [r1]; +;CHECK: ld.global.u16 rh0, [r1]; +;CHECK-NEXT: ret; + %x = load i16* %p + ret i16 %x +} + +define ptx_device i32 @t1_u32(i32* %p) { +entry: +;CHECK: ld.global.u32 r0, [r1]; +;CHECK-NEXT: ret; %x = load i32* %p ret i32 %x } -define ptx_device i32 @t2(i32* %p) { +define ptx_device i64 @t1_u64(i64* %p) { entry: -;CHECK: ld.global.s32 r0, [r1+4]; +;CHECK: ld.global.u64 rd0, [r1]; +;CHECK-NEXT: ret; + %x = load i64* %p + ret i64 %x +} +
+define ptx_device float @t1_f32(float* %p) { +entry: +;CHECK: ld.global.f32 f0, [r1]; +;CHECK-NEXT: ret; + %x = load float* %p + ret float %x +} + +define ptx_device double @t1_f64(double* %p) { +entry: +;CHECK: ld.global.f64 fd0, [r1]; +;CHECK-NEXT: ret; + %x = load double* %p + ret double %x +} + +define ptx_device i16 @t2_u16(i16* %p) { +entry: +;CHECK: ld.global.u16 rh0, [r1+2]; +;CHECK-NEXT: ret; + %i = getelementptr i16* %p, i32 1 + %x = load i16* %i + ret i16 %x +} + +define ptx_device i32 @t2_u32(i32* %p) { +entry: +;CHECK: ld.global.u32 r0, [r1+4]; +;CHECK-NEXT: ret; %i = getelementptr i32* %p, i32 1 %x = load i32* %i ret i32 %x } -define ptx_device i32 @t3(i32* %p, i32 %q) { +define ptx_device i64 @t2_u64(i64* %p) { +entry: +;CHECK: ld.global.u64 rd0, [r1+8]; +;CHECK-NEXT: ret; + %i = getelementptr i64* %p, i32 1 + %x = load i64* %i + ret i64 %x +} + +define ptx_device float @t2_f32(float* %p) { +entry: +;CHECK: ld.global.f32 f0, [r1+4]; +;CHECK-NEXT: ret; + %i = getelementptr float* %p, i32 1 + %x = load float* %i + ret float %x +} + +define ptx_device double @t2_f64(double* %p) { +entry: +;CHECK: ld.global.f64 fd0, [r1+8]; +;CHECK-NEXT: ret; + %i = getelementptr double* %p, i32 1 + %x = load double* %i + ret double %x +} + +define ptx_device i16 @t3_u16(i16* %p, i32 %q) { +entry: +;CHECK: shl.b32 r0, r2, 1; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: ld.global.u16 rh0, [r0]; + %i = getelementptr i16* %p, i32 %q + %x = load i16* %i + ret i16 %x +} + +define ptx_device i32 @t3_u32(i32* %p, i32 %q) { entry: ;CHECK: shl.b32 r0, r2, 2; -;CHECK: add.s32 r0, r1, r0; -;CHECK: ld.global.s32 r0, [r0]; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: ld.global.u32 r0, [r0]; %i = getelementptr i32* %p, i32 %q %x = load i32* %i ret i32 %x } -define ptx_device i32 @t4_global() { +define ptx_device i64 @t3_u64(i64* %p, i32 %q) { entry: -;CHECK: ld.global.s32 r0, [array]; - %i = getelementptr [10 x i32]* @array, i32 0, i32 0 +;CHECK: shl.b32 r0, r2, 3; 
+;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: ld.global.u64 rd0, [r0]; + %i = getelementptr i64* %p, i32 %q + %x = load i64* %i + ret i64 %x +} + +define ptx_device float @t3_f32(float* %p, i32 %q) { +entry: +;CHECK: shl.b32 r0, r2, 2; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: ld.global.f32 f0, [r0]; + %i = getelementptr float* %p, i32 %q + %x = load float* %i + ret float %x +} + +define ptx_device double @t3_f64(double* %p, i32 %q) { +entry: +;CHECK: shl.b32 r0, r2, 3; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: ld.global.f64 fd0, [r0]; + %i = getelementptr double* %p, i32 %q + %x = load double* %i + ret double %x +} + +define ptx_device i16 @t4_global_u16() { +entry: +;CHECK: ld.global.u16 rh0, [array_i16]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 0 + %x = load i16* %i + ret i16 %x +} + +define ptx_device i32 @t4_global_u32() { +entry: +;CHECK: ld.global.u32 r0, [array_i32]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 0 %x = load i32* %i ret i32 %x } -define ptx_device i32 @t4_const() { +define ptx_device i64 @t4_global_u64() { entry: -;CHECK: ld.const.s32 r0, [array_constant]; - %i = getelementptr [10 x i32] addrspace(1)* @array_constant, i32 0, i32 0 +;CHECK: ld.global.u64 rd0, [array_i64]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 0 + %x = load i64* %i + ret i64 %x +} + +define ptx_device float @t4_global_f32() { +entry: +;CHECK: ld.global.f32 f0, [array_float]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float]* @array_float, i32 0, i32 0 + %x = load float* %i + ret float %x +} + +define ptx_device double @t4_global_f64() { +entry: +;CHECK: ld.global.f64 fd0, [array_double]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double]* @array_double, i32 0, i32 0 + %x = load double* %i + ret double %x +} + +define ptx_device i16 @t4_const_u16() { +entry: +;CHECK: ld.const.u16 rh0, [array_constant_i16]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16] 
addrspace(1)* @array_constant_i16, i32 0, i32 0 + %x = load i16 addrspace(1)* %i + ret i16 %x +} + +define ptx_device i32 @t4_const_u32() { +entry: +;CHECK: ld.const.u32 r0, [array_constant_i32]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32] addrspace(1)* @array_constant_i32, i32 0, i32 0 %x = load i32 addrspace(1)* %i ret i32 %x } -define ptx_device i32 @t4_local() { +define ptx_device i64 @t4_const_u64() { entry: -;CHECK: ld.local.s32 r0, [array_local]; - %i = getelementptr [10 x i32] addrspace(2)* @array_local, i32 0, i32 0 +;CHECK: ld.const.u64 rd0, [array_constant_i64]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i64] addrspace(1)* @array_constant_i64, i32 0, i32 0 + %x = load i64 addrspace(1)* %i + ret i64 %x +} + +define ptx_device float @t4_const_f32() { +entry: +;CHECK: ld.const.f32 f0, [array_constant_float]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(1)* @array_constant_float, i32 0, i32 0 + %x = load float addrspace(1)* %i + ret float %x +} + +define ptx_device double @t4_const_f64() { +entry: +;CHECK: ld.const.f64 fd0, [array_constant_double]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double] addrspace(1)* @array_constant_double, i32 0, i32 0 + %x = load double addrspace(1)* %i + ret double %x +} + +define ptx_device i16 @t4_local_u16() { +entry: +;CHECK: ld.local.u16 rh0, [array_local_i16]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16] addrspace(2)* @array_local_i16, i32 0, i32 0 + %x = load i16 addrspace(2)* %i + ret i16 %x +} + +define ptx_device i32 @t4_local_u32() { +entry: +;CHECK: ld.local.u32 r0, [array_local_i32]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32] addrspace(2)* @array_local_i32, i32 0, i32 0 %x = load i32 addrspace(2)* %i ret i32 %x } -define ptx_device i32 @t4_shared() { +define ptx_device i64 @t4_local_u64() { entry: -;CHECK: ld.shared.s32 r0, [array_shared]; - %i = getelementptr [10 x i32] addrspace(4)* @array_shared, i32 0, i32 0 +;CHECK: ld.local.u64 rd0, [array_local_i64]; 
+;CHECK-NEXT: ret; + %i = getelementptr [10 x i64] addrspace(2)* @array_local_i64, i32 0, i32 0 + %x = load i64 addrspace(2)* %i + ret i64 %x +} + +define ptx_device float @t4_local_f32() { +entry: +;CHECK: ld.local.f32 f0, [array_local_float]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(2)* @array_local_float, i32 0, i32 0 + %x = load float addrspace(2)* %i + ret float %x +} + +define ptx_device double @t4_local_f64() { +entry: +;CHECK: ld.local.f64 fd0, [array_local_double]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double] addrspace(2)* @array_local_double, i32 0, i32 0 + %x = load double addrspace(2)* %i + ret double %x +} + +define ptx_device i16 @t4_shared_u16() { +entry: +;CHECK: ld.shared.u16 rh0, [array_shared_i16]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16] addrspace(4)* @array_shared_i16, i32 0, i32 0 + %x = load i16 addrspace(4)* %i + ret i16 %x +} + +define ptx_device i32 @t4_shared_u32() { +entry: +;CHECK: ld.shared.u32 r0, [array_shared_i32]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32] addrspace(4)* @array_shared_i32, i32 0, i32 0 %x = load i32 addrspace(4)* %i ret i32 %x } -define ptx_device i32 @t5() { +define ptx_device i64 @t4_shared_u64() { entry: -;CHECK: ld.global.s32 r0, [array+4]; - %i = getelementptr [10 x i32]* @array, i32 0, i32 1 +;CHECK: ld.shared.u64 rd0, [array_shared_i64]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i64] addrspace(4)* @array_shared_i64, i32 0, i32 0 + %x = load i64 addrspace(4)* %i + ret i64 %x +} + +define ptx_device float @t4_shared_f32() { +entry: +;CHECK: ld.shared.f32 f0, [array_shared_float]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(4)* @array_shared_float, i32 0, i32 0 + %x = load float addrspace(4)* %i + ret float %x +} + +define ptx_device double @t4_shared_f64() { +entry: +;CHECK: ld.shared.f64 fd0, [array_shared_double]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double] addrspace(4)* @array_shared_double, i32 0, i32 0 + %x = load double 
addrspace(4)* %i + ret double %x +} + +define ptx_device i16 @t5_u16() { +entry: +;CHECK: ld.global.u16 rh0, [array_i16+2]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 1 + %x = load i16* %i + ret i16 %x +} + +define ptx_device i32 @t5_u32() { +entry: +;CHECK: ld.global.u32 r0, [array_i32+4]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 1 %x = load i32* %i ret i32 %x } + +define ptx_device i64 @t5_u64() { +entry: +;CHECK: ld.global.u64 rd0, [array_i64+8]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 1 + %x = load i64* %i + ret i64 %x +} + +define ptx_device float @t5_f32() { +entry: +;CHECK: ld.global.f32 f0, [array_float+4]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float]* @array_float, i32 0, i32 1 + %x = load float* %i + ret float %x +} + +define ptx_device double @t5_f64() { +entry: +;CHECK: ld.global.f64 fd0, [array_double+8]; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double]* @array_double, i32 0, i32 1 + %x = load double* %i + ret double %x +} diff --git a/test/CodeGen/PTX/ld_float.ll b/test/CodeGen/PTX/ld_float.ll deleted file mode 100644 index 62d2c36e64a..00000000000 --- a/test/CodeGen/PTX/ld_float.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc < %s -march=ptx | FileCheck %s - -;CHECK: .extern .global .f32 array[]; -@array = external global [10 x float] - -;CHECK: .extern .const .f32 array_constant[]; -@array_constant = external addrspace(1) constant [10 x float] - -;CHECK: .extern .local .f32 array_local[]; -@array_local = external addrspace(2) global [10 x float] - -;CHECK: .extern .shared .f32 array_shared[]; -@array_shared = external addrspace(4) global [10 x float] - -define ptx_device float @t1(float* %p) { -entry: -;CHECK: ld.global.f32 f0, [r1]; -;CHECK-NEXT: ret; - %x = load float* %p - ret float %x -} - -define ptx_device float @t2(float* %p) { -entry: -;CHECK: ld.global.f32 f0, [r1+4]; -;CHECK-NEXT: ret; - %i = getelementptr float* %p, i32 1 - %x = 
load float* %i - ret float %x -} - -define ptx_device float @t3(float* %p, i32 %q) { -entry: -;CHECK: shl.b32 r0, r2, 2; -;CHECK-NEXT: add.s32 r0, r1, r0; -;CHECK-NEXT: ld.global.f32 f0, [r0]; -;CHECK-NEXT: ret; - %i = getelementptr float* %p, i32 %q - %x = load float* %i - ret float %x -} - -define ptx_device float @t4_global() { -entry: -;CHECK: ld.global.f32 f0, [array]; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float]* @array, i32 0, i32 0 - %x = load float* %i - ret float %x -} - -define ptx_device float @t4_const() { -entry: -;CHECK: ld.const.f32 f0, [array_constant]; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float] addrspace(1)* @array_constant, i32 0, i32 0 - %x = load float addrspace(1)* %i - ret float %x -} - -define ptx_device float @t4_local() { -entry: -;CHECK: ld.local.f32 f0, [array_local]; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float] addrspace(2)* @array_local, i32 0, i32 0 - %x = load float addrspace(2)* %i - ret float %x -} - -define ptx_device float @t4_shared() { -entry: -;CHECK: ld.shared.f32 f0, [array_shared]; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float] addrspace(4)* @array_shared, i32 0, i32 0 - %x = load float addrspace(4)* %i - ret float %x -} - -define ptx_device float @t5() { -entry: -;CHECK: ld.global.f32 f0, [array+4]; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float]* @array, i32 0, i32 1 - %x = load float* %i - ret float %x -} diff --git a/test/CodeGen/PTX/mov.ll b/test/CodeGen/PTX/mov.ll index d201a7867aa..00dcf19f1da 100644 --- a/test/CodeGen/PTX/mov.ll +++ b/test/CodeGen/PTX/mov.ll @@ -1,25 +1,62 @@ ; RUN: llc < %s -march=ptx | FileCheck %s -define ptx_device i32 @t1() { -; CHECK: mov.s32 r0, 0; +define ptx_device i16 @t1_u16() { +; CHECK: mov.u16 rh0, 0; +; CHECK: ret; + ret i16 0 +} + +define ptx_device i32 @t1_u32() { +; CHECK: mov.u32 r0, 0; ; CHECK: ret; ret i32 0 } -define ptx_device i32 @t2(i32 %x) { -; CHECK: mov.s32 r0, r1; +define ptx_device i64 @t1_u64() { +; CHECK: mov.u64 rd0, 0; +; CHECK: 
ret; + ret i64 0 +} + +define ptx_device float @t1_f32() { +; CHECK: mov.f32 f0, 0F00000000; +; CHECK: ret; + ret float 0.0 +} + +define ptx_device double @t1_f64() { +; CHECK: mov.f64 fd0, 0D0000000000000000; +; CHECK: ret; + ret double 0.0 +} + +define ptx_device i16 @t2_u16(i16 %x) { +; CHECK: mov.u16 rh0, rh1; +; CHECK: ret; + ret i16 %x +} + +define ptx_device i32 @t2_u32(i32 %x) { +; CHECK: mov.u32 r0, r1; ; CHECK: ret; ret i32 %x } -define ptx_device float @t3() { -; CHECK: mov.f32 f0, 0F00000000; -; CHECK-NEXT: ret; - ret float 0.0 +define ptx_device i64 @t2_u64(i64 %x) { +; CHECK: mov.u64 rd0, rd1; +; CHECK: ret; + ret i64 %x } -define ptx_device float @t4(float %x) { +define ptx_device float @t3_f32(float %x) { ; CHECK: mov.f32 f0, f1; ; CHECK-NEXT: ret; ret float %x } + +define ptx_device double @t3_f64(double %x) { +; CHECK: mov.f64 fd0, fd1; +; CHECK-NEXT: ret; + ret double %x +} + diff --git a/test/CodeGen/PTX/mul.ll b/test/CodeGen/PTX/mul.ll index 01871da4893..fd0788fce66 100644 --- a/test/CodeGen/PTX/mul.ll +++ b/test/CodeGen/PTX/mul.ll @@ -10,16 +10,30 @@ ; ret i32 %z ;} -define ptx_device float @t3(float %x, float %y) { +define ptx_device float @t1_f32(float %x, float %y) { ; CHECK: mul.f32 f0, f1, f2 ; CHECK-NEXT: ret; %z = fmul float %x, %y ret float %z } -define ptx_device float @t4(float %x) { +define ptx_device double @t1_f64(double %x, double %y) { +; CHECK: mul.f64 fd0, fd1, fd2 +; CHECK-NEXT: ret; + %z = fmul double %x, %y + ret double %z +} + +define ptx_device float @t2_f32(float %x) { ; CHECK: mul.f32 f0, f1, 0F40A00000; ; CHECK-NEXT: ret; %z = fmul float %x, 5.0 ret float %z } + +define ptx_device double @t2_f64(double %x) { +; CHECK: mul.f64 fd0, fd1, 0D4014000000000000; +; CHECK-NEXT: ret; + %z = fmul double %x, 5.0 + ret double %z +} diff --git a/test/CodeGen/PTX/options.ll b/test/CodeGen/PTX/options.ll index a14d5c9c27b..1435537e007 100644 --- a/test/CodeGen/PTX/options.ll +++ b/test/CodeGen/PTX/options.ll @@ -1,5 +1,8 @@ -; RUN: 
llc < %s -march=ptx -ptx-version=2.0 | grep ".version 2.0" -; RUN: llc < %s -march=ptx -ptx-target=sm_20 | grep ".target sm_20" +; RUN: llc < %s -march=ptx -mattr=ptx14 | grep ".version 1.4" +; RUN: llc < %s -march=ptx -mattr=ptx20 | grep ".version 2.0" +; RUN: llc < %s -march=ptx -mattr=ptx21 | grep ".version 2.1" +; RUN: llc < %s -march=ptx -mattr=sm20 | grep ".target sm_20" +; RUN: llc < %s -march=ptx -mattr=sm13 | grep ".target sm_13" define ptx_device void @t1() { ret void diff --git a/test/CodeGen/PTX/st.ll b/test/CodeGen/PTX/st.ll index 2cbacb9ee59..bbe89a10648 100644 --- a/test/CodeGen/PTX/st.ll +++ b/test/CodeGen/PTX/st.ll @@ -1,71 +1,382 @@ ; RUN: llc < %s -march=ptx | FileCheck %s -;CHECK: .extern .global .s32 array[]; -@array = external global [10 x i32] +;CHECK: .extern .global .u16 array_i16[]; +@array_i16 = external global [10 x i16] -;CHECK: .extern .const .s32 array_constant[]; -@array_constant = external addrspace(1) constant [10 x i32] +;CHECK: .extern .const .u16 array_constant_i16[]; +@array_constant_i16 = external addrspace(1) constant [10 x i16] -;CHECK: .extern .local .s32 array_local[]; -@array_local = external addrspace(2) global [10 x i32] +;CHECK: .extern .local .u16 array_local_i16[]; +@array_local_i16 = external addrspace(2) global [10 x i16] -;CHECK: .extern .shared .s32 array_shared[]; -@array_shared = external addrspace(4) global [10 x i32] +;CHECK: .extern .shared .u16 array_shared_i16[]; +@array_shared_i16 = external addrspace(4) global [10 x i16] -define ptx_device void @t1(i32* %p, i32 %x) { +;CHECK: .extern .global .u32 array_i32[]; +@array_i32 = external global [10 x i32] + +;CHECK: .extern .const .u32 array_constant_i32[]; +@array_constant_i32 = external addrspace(1) constant [10 x i32] + +;CHECK: .extern .local .u32 array_local_i32[]; +@array_local_i32 = external addrspace(2) global [10 x i32] + +;CHECK: .extern .shared .u32 array_shared_i32[]; +@array_shared_i32 = external addrspace(4) global [10 x i32] + +;CHECK: .extern 
.global .u64 array_i64[]; +@array_i64 = external global [10 x i64] + +;CHECK: .extern .const .u64 array_constant_i64[]; +@array_constant_i64 = external addrspace(1) constant [10 x i64] + +;CHECK: .extern .local .u64 array_local_i64[]; +@array_local_i64 = external addrspace(2) global [10 x i64] + +;CHECK: .extern .shared .u64 array_shared_i64[]; +@array_shared_i64 = external addrspace(4) global [10 x i64] + +;CHECK: .extern .global .f32 array_float[]; +@array_float = external global [10 x float] + +;CHECK: .extern .const .f32 array_constant_float[]; +@array_constant_float = external addrspace(1) constant [10 x float] + +;CHECK: .extern .local .f32 array_local_float[]; +@array_local_float = external addrspace(2) global [10 x float] + +;CHECK: .extern .shared .f32 array_shared_float[]; +@array_shared_float = external addrspace(4) global [10 x float] + +;CHECK: .extern .global .f64 array_double[]; +@array_double = external global [10 x double] + +;CHECK: .extern .const .f64 array_constant_double[]; +@array_constant_double = external addrspace(1) constant [10 x double] + +;CHECK: .extern .local .f64 array_local_double[]; +@array_local_double = external addrspace(2) global [10 x double] + +;CHECK: .extern .shared .f64 array_shared_double[]; +@array_shared_double = external addrspace(4) global [10 x double] + + +define ptx_device void @t1_u16(i16* %p, i16 %x) { entry: -;CHECK: st.global.s32 [r1], r2; +;CHECK: st.global.u16 [r1], rh1; +;CHECK-NEXT: ret; + store i16 %x, i16* %p + ret void +} + +define ptx_device void @t1_u32(i32* %p, i32 %x) { +entry: +;CHECK: st.global.u32 [r1], r2; +;CHECK-NEXT: ret; store i32 %x, i32* %p ret void } -define ptx_device void @t2(i32* %p, i32 %x) { +define ptx_device void @t1_u64(i64* %p, i64 %x) { entry: -;CHECK: st.global.s32 [r1+4], r2; +;CHECK: st.global.u64 [r1], rd1; +;CHECK-NEXT: ret; + store i64 %x, i64* %p + ret void +} + +define ptx_device void @t1_f32(float* %p, float %x) { +entry: +;CHECK: st.global.f32 [r1], f1; +;CHECK-NEXT: 
ret; + store float %x, float* %p + ret void +} + +define ptx_device void @t1_f64(double* %p, double %x) { +entry: +;CHECK: st.global.f64 [r1], fd1; +;CHECK-NEXT: ret; + store double %x, double* %p + ret void +} + +define ptx_device void @t2_u16(i16* %p, i16 %x) { +entry: +;CHECK: st.global.u16 [r1+2], rh1; +;CHECK-NEXT: ret; + %i = getelementptr i16* %p, i32 1 + store i16 %x, i16* %i + ret void +} + +define ptx_device void @t2_u32(i32* %p, i32 %x) { +entry: +;CHECK: st.global.u32 [r1+4], r2; +;CHECK-NEXT: ret; %i = getelementptr i32* %p, i32 1 store i32 %x, i32* %i ret void } -define ptx_device void @t3(i32* %p, i32 %q, i32 %x) { -;CHECK: .reg .s32 r0; +define ptx_device void @t2_u64(i64* %p, i64 %x) { +entry: +;CHECK: st.global.u64 [r1+8], rd1; +;CHECK-NEXT: ret; + %i = getelementptr i64* %p, i32 1 + store i64 %x, i64* %i + ret void +} + +define ptx_device void @t2_f32(float* %p, float %x) { +entry: +;CHECK: st.global.f32 [r1+4], f1; +;CHECK-NEXT: ret; + %i = getelementptr float* %p, i32 1 + store float %x, float* %i + ret void +} + +define ptx_device void @t2_f64(double* %p, double %x) { +entry: +;CHECK: st.global.f64 [r1+8], fd1; +;CHECK-NEXT: ret; + %i = getelementptr double* %p, i32 1 + store double %x, double* %i + ret void +} + +define ptx_device void @t3_u16(i16* %p, i32 %q, i16 %x) { +entry: +;CHECK: shl.b32 r0, r2, 1; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: st.global.u16 [r0], rh1; +;CHECK-NEXT: ret; + %i = getelementptr i16* %p, i32 %q + store i16 %x, i16* %i + ret void +} + +define ptx_device void @t3_u32(i32* %p, i32 %q, i32 %x) { entry: ;CHECK: shl.b32 r0, r2, 2; -;CHECK: add.s32 r0, r1, r0; -;CHECK: st.global.s32 [r0], r3; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: st.global.u32 [r0], r3; +;CHECK-NEXT: ret; %i = getelementptr i32* %p, i32 %q store i32 %x, i32* %i ret void } -define ptx_device void @t4_global(i32 %x) { +define ptx_device void @t3_u64(i64* %p, i32 %q, i64 %x) { entry: -;CHECK: st.global.s32 [array], r1; - %i = 
getelementptr [10 x i32]* @array, i32 0, i32 0 +;CHECK: shl.b32 r0, r2, 3; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: st.global.u64 [r0], rd1; +;CHECK-NEXT: ret; + %i = getelementptr i64* %p, i32 %q + store i64 %x, i64* %i + ret void +} + +define ptx_device void @t3_f32(float* %p, i32 %q, float %x) { +entry: +;CHECK: shl.b32 r0, r2, 2; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: st.global.f32 [r0], f1; +;CHECK-NEXT: ret; + %i = getelementptr float* %p, i32 %q + store float %x, float* %i + ret void +} + +define ptx_device void @t3_f64(double* %p, i32 %q, double %x) { +entry: +;CHECK: shl.b32 r0, r2, 3; +;CHECK-NEXT: add.u32 r0, r1, r0; +;CHECK-NEXT: st.global.f64 [r0], fd1; +;CHECK-NEXT: ret; + %i = getelementptr double* %p, i32 %q + store double %x, double* %i + ret void +} + +define ptx_device void @t4_global_u16(i16 %x) { +entry: +;CHECK: st.global.u16 [array_i16], rh1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16]* @array_i16, i16 0, i16 0 + store i16 %x, i16* %i + ret void +} + +define ptx_device void @t4_global_u32(i32 %x) { +entry: +;CHECK: st.global.u32 [array_i32], r1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 0 store i32 %x, i32* %i ret void } -define ptx_device void @t4_local(i32 %x) { +define ptx_device void @t4_global_u64(i64 %x) { entry: -;CHECK: st.local.s32 [array_local], r1; - %i = getelementptr [10 x i32] addrspace(2)* @array_local, i32 0, i32 0 +;CHECK: st.global.u64 [array_i64], rd1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 0 + store i64 %x, i64* %i + ret void +} + +define ptx_device void @t4_global_f32(float %x) { +entry: +;CHECK: st.global.f32 [array_float], f1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float]* @array_float, i32 0, i32 0 + store float %x, float* %i + ret void +} + +define ptx_device void @t4_global_f64(double %x) { +entry: +;CHECK: st.global.f64 [array_double], fd1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double]* @array_double, i32 0, 
i32 0 + store double %x, double* %i + ret void +} + +define ptx_device void @t4_local_u16(i16 %x) { +entry: +;CHECK: st.local.u16 [array_local_i16], rh1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16] addrspace(2)* @array_local_i16, i32 0, i32 0 + store i16 %x, i16 addrspace(2)* %i + ret void +} + +define ptx_device void @t4_local_u32(i32 %x) { +entry: +;CHECK: st.local.u32 [array_local_i32], r1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32] addrspace(2)* @array_local_i32, i32 0, i32 0 store i32 %x, i32 addrspace(2)* %i ret void } -define ptx_device void @t4_shared(i32 %x) { +define ptx_device void @t4_local_u64(i64 %x) { entry: -;CHECK: st.shared.s32 [array_shared], r1; - %i = getelementptr [10 x i32] addrspace(4)* @array_shared, i32 0, i32 0 +;CHECK: st.local.u64 [array_local_i64], rd1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i64] addrspace(2)* @array_local_i64, i32 0, i32 0 + store i64 %x, i64 addrspace(2)* %i + ret void +} + +define ptx_device void @t4_local_f32(float %x) { +entry: +;CHECK: st.local.f32 [array_local_float], f1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(2)* @array_local_float, i32 0, i32 0 + store float %x, float addrspace(2)* %i + ret void +} + +define ptx_device void @t4_local_f64(double %x) { +entry: +;CHECK: st.local.f64 [array_local_double], fd1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double] addrspace(2)* @array_local_double, i32 0, i32 0 + store double %x, double addrspace(2)* %i + ret void +} + +define ptx_device void @t4_shared_u16(i16 %x) { +entry: +;CHECK: st.shared.u16 [array_shared_i16], rh1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16] addrspace(4)* @array_shared_i16, i32 0, i32 0 + store i16 %x, i16 addrspace(4)* %i + ret void +} + +define ptx_device void @t4_shared_u32(i32 %x) { +entry: +;CHECK: st.shared.u32 [array_shared_i32], r1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32] addrspace(4)* @array_shared_i32, i32 0, i32 0 store i32 %x, i32 addrspace(4)* %i ret void } -define 
ptx_device void @t5(i32 %x) { +define ptx_device void @t4_shared_u64(i64 %x) { entry: -;CHECK: st.global.s32 [array+4], r1; - %i = getelementptr [10 x i32]* @array, i32 0, i32 1 +;CHECK: st.shared.u64 [array_shared_i64], rd1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i64] addrspace(4)* @array_shared_i64, i32 0, i32 0 + store i64 %x, i64 addrspace(4)* %i + ret void +} + +define ptx_device void @t4_shared_f32(float %x) { +entry: +;CHECK: st.shared.f32 [array_shared_float], f1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float] addrspace(4)* @array_shared_float, i32 0, i32 0 + store float %x, float addrspace(4)* %i + ret void +} + +define ptx_device void @t4_shared_f64(double %x) { +entry: +;CHECK: st.shared.f64 [array_shared_double], fd1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double] addrspace(4)* @array_shared_double, i32 0, i32 0 + store double %x, double addrspace(4)* %i + ret void +} + +define ptx_device void @t5_u16(i16 %x) { +entry: +;CHECK: st.global.u16 [array_i16+2], rh1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 1 + store i16 %x, i16* %i + ret void +} + +define ptx_device void @t5_u32(i32 %x) { +entry: +;CHECK: st.global.u32 [array_i32+4], r1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 1 store i32 %x, i32* %i ret void } + +define ptx_device void @t5_u64(i64 %x) { +entry: +;CHECK: st.global.u64 [array_i64+8], rd1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 1 + store i64 %x, i64* %i + ret void +} + +define ptx_device void @t5_f32(float %x) { +entry: +;CHECK: st.global.f32 [array_float+4], f1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x float]* @array_float, i32 0, i32 1 + store float %x, float* %i + ret void +} + +define ptx_device void @t5_f64(double %x) { +entry: +;CHECK: st.global.f64 [array_double+8], fd1; +;CHECK-NEXT: ret; + %i = getelementptr [10 x double]* @array_double, i32 0, i32 1 + store double %x, double* %i + ret void +} diff --git 
a/test/CodeGen/PTX/st_float.ll b/test/CodeGen/PTX/st_float.ll deleted file mode 100644 index f0e00105f93..00000000000 --- a/test/CodeGen/PTX/st_float.ll +++ /dev/null @@ -1,78 +0,0 @@ -; RUN: llc < %s -march=ptx | FileCheck %s - -;CHECK: .extern .global .f32 array[]; -@array = external global [10 x float] - -;CHECK: .extern .const .f32 array_constant[]; -@array_constant = external addrspace(1) constant [10 x float] - -;CHECK: .extern .local .f32 array_local[]; -@array_local = external addrspace(2) global [10 x float] - -;CHECK: .extern .shared .f32 array_shared[]; -@array_shared = external addrspace(4) global [10 x float] - -define ptx_device void @t1(float* %p, float %x) { -entry: -;CHECK: st.global.f32 [r1], f1; -;CHECK-NEXT: ret; - store float %x, float* %p - ret void -} - -define ptx_device void @t2(float* %p, float %x) { -entry: -;CHECK: st.global.f32 [r1+4], f1; -;CHECK-NEXT: ret; - %i = getelementptr float* %p, i32 1 - store float %x, float* %i - ret void -} - -define ptx_device void @t3(float* %p, i32 %q, float %x) { -;CHECK: .reg .s32 r0; -entry: -;CHECK: shl.b32 r0, r2, 2; -;CHECK-NEXT: add.s32 r0, r1, r0; -;CHECK-NEXT: st.global.f32 [r0], f1; -;CHECK-NEXT: ret; - %i = getelementptr float* %p, i32 %q - store float %x, float* %i - ret void -} - -define ptx_device void @t4_global(float %x) { -entry: -;CHECK: st.global.f32 [array], f1; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float]* @array, i32 0, i32 0 - store float %x, float* %i - ret void -} - -define ptx_device void @t4_local(float %x) { -entry: -;CHECK: st.local.f32 [array_local], f1; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float] addrspace(2)* @array_local, i32 0, i32 0 - store float %x, float addrspace(2)* %i - ret void -} - -define ptx_device void @t4_shared(float %x) { -entry: -;CHECK: st.shared.f32 [array_shared], f1; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float] addrspace(4)* @array_shared, i32 0, i32 0 - store float %x, float addrspace(4)* %i - ret void -} - -define ptx_device 
void @t5(float %x) { -entry: -;CHECK: st.global.f32 [array+4], f1; -;CHECK-NEXT: ret; - %i = getelementptr [10 x float]* @array, i32 0, i32 1 - store float %x, float* %i - ret void -} diff --git a/test/CodeGen/PTX/sub.ll b/test/CodeGen/PTX/sub.ll index e11decaf5cf..4810e4fc055 100644 --- a/test/CodeGen/PTX/sub.ll +++ b/test/CodeGen/PTX/sub.ll @@ -1,29 +1,71 @@ ; RUN: llc < %s -march=ptx | FileCheck %s -define ptx_device i32 @t1(i32 %x, i32 %y) { -;CHECK: sub.s32 r0, r1, r2; +define ptx_device i16 @t1_u16(i16 %x, i16 %y) { +; CHECK: sub.u16 rh0, rh1, rh2; +; CHECK-NEXT: ret; + %z = sub i16 %x, %y + ret i16 %z +} + +define ptx_device i32 @t1_u32(i32 %x, i32 %y) { +; CHECK: sub.u32 r0, r1, r2; +; CHECK-NEXT: ret; %z = sub i32 %x, %y -;CHECK: ret; ret i32 %z } -define ptx_device i32 @t2(i32 %x) { -;CHECK: add.s32 r0, r1, -1; - %z = sub i32 %x, 1 -;CHECK: ret; - ret i32 %z +define ptx_device i64 @t1_u64(i64 %x, i64 %y) { +; CHECK: sub.u64 rd0, rd1, rd2; +; CHECK-NEXT: ret; + %z = sub i64 %x, %y + ret i64 %z } -define ptx_device float @t3(float %x, float %y) { +define ptx_device float @t1_f32(float %x, float %y) { ; CHECK: sub.f32 f0, f1, f2 ; CHECK-NEXT: ret; %z = fsub float %x, %y ret float %z } -define ptx_device float @t4(float %x) { +define ptx_device double @t1_f64(double %x, double %y) { +; CHECK: sub.f64 fd0, fd1, fd2 +; CHECK-NEXT: ret; + %z = fsub double %x, %y + ret double %z +} + +define ptx_device i16 @t2_u16(i16 %x) { +; CHECK: add.u16 rh0, rh1, -1; +; CHECK-NEXT: ret; + %z = sub i16 %x, 1 + ret i16 %z +} + +define ptx_device i32 @t2_u32(i32 %x) { +; CHECK: add.u32 r0, r1, -1; +; CHECK-NEXT: ret; + %z = sub i32 %x, 1 + ret i32 %z +} + +define ptx_device i64 @t2_u64(i64 %x) { +; CHECK: add.u64 rd0, rd1, -1; +; CHECK-NEXT: ret; + %z = sub i64 %x, 1 + ret i64 %z +} + +define ptx_device float @t2_f32(float %x) { ; CHECK: add.f32 f0, f1, 0FBF800000; ; CHECK-NEXT: ret; %z = fsub float %x, 1.0 ret float %z } + +define ptx_device double @t2_f64(double %x) { +; 
CHECK: add.f64 fd0, fd1, 0DBFF0000000000000; +; CHECK-NEXT: ret; + %z = fsub double %x, 1.0 + ret double %z +}