From 5443e7d79044f3198f2da044f1b389b40d9bea6f Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Fri, 31 May 2013 12:14:49 +0000 Subject: [PATCH] [NVPTX] Re-enable support for virtual registers in the final output Now that 3.3 is branched, we are re-enabling virtual registers to help iron out bugs before the next release. Some of the post-RA passes do not play well with virtual registers, so we disable them for now. The needed functionality of the PrologEpilogInserter pass is copied to a new backend-specific NVPTXPrologEpilog pass. No new test accompanies this commit; it is validated by the existing tests continuing to pass. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182998 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/NVPTX/CMakeLists.txt | 1 + lib/Target/NVPTX/NVPTX.h | 2 + lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 61 +++--- lib/Target/NVPTX/NVPTXAsmPrinter.h | 5 +- lib/Target/NVPTX/NVPTXInstrInfo.cpp | 46 ++--- lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 225 +++++++++++++++++++++ lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 4 +- lib/Target/NVPTX/NVPTXTargetMachine.cpp | 27 +++ test/CodeGen/NVPTX/intrinsic-old.ll | 66 +++--- test/CodeGen/NVPTX/intrinsics.ll | 4 +- 10 files changed, 341 insertions(+), 100 deletions(-) create mode 100644 lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 735ca9b6b76..a8293da7607 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -24,6 +24,7 @@ set(NVPTXCodeGen_sources NVPTXUtilities.cpp NVVMReflect.cpp NVPTXGenericToNVVM.cpp + NVPTXPrologEpilogPass.cpp ) add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 072c65da35c..179dc277f4e 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -27,6 +27,7 @@ namespace llvm { class NVPTXTargetMachine; class FunctionPass; +class MachineFunctionPass; class formatted_raw_ostream; namespace NVPTXCC { @@ -66,6
+67,7 @@ FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &); ModulePass *createGenericToNVVMPass(); ModulePass *createNVVMReflectPass(); ModulePass *createNVVMReflectPass(const StringMap& Mapping); +MachineFunctionPass *createNVPTXPrologEpilogPass(); bool isImageOrSamplerVal(const Value *, const Module *); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 6cc52bda98d..44f357d0845 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -436,9 +436,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { } void NVPTXAsmPrinter::EmitFunctionBodyStart() { - const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); - unsigned numRegClasses = TRI.getNumRegClasses(); - VRidGlobal2LocalMap = new std::map[numRegClasses + 1]; + VRegMapping.clear(); OutStreamer.EmitRawText(StringRef("{\n")); setAndEmitFunctionVirtualRegisters(*MF); @@ -450,7 +448,7 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() { void NVPTXAsmPrinter::EmitFunctionBodyEnd() { OutStreamer.EmitRawText(StringRef("}\n")); - delete[] VRidGlobal2LocalMap; + VRegMapping.clear(); } void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, @@ -507,9 +505,8 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec, raw_ostream &O) { const TargetRegisterClass *RC = MRI->getRegClass(vr); - unsigned id = RC->getID(); - std::map &regmap = VRidGlobal2LocalMap[id]; + DenseMap &regmap = VRegMapping[RC]; unsigned mapped_vr = regmap[vr]; if (!isVec) { @@ -1709,48 +1706,36 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( for (unsigned i = 0; i < numVRs; i++) { unsigned int vr = TRI->index2VirtReg(i); const TargetRegisterClass *RC = MRI->getRegClass(vr); - std::map &regmap = VRidGlobal2LocalMap[RC->getID()]; + DenseMap &regmap = VRegMapping[RC]; int n = regmap.size(); regmap.insert(std::make_pair(vr, n + 1)); } // Emit register declarations //
@TODO: Extract out the real register usage - O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n"; - O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n"; + // O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n"; // Emit declaration of the virtual registers or 'physical' registers for // each register class - //for (unsigned i=0; i< numRegClasses; i++) { - // std::map &regmap = VRidGlobal2LocalMap[i]; - // const TargetRegisterClass *RC = TRI->getRegClass(i); - // std::string rcname = getNVPTXRegClassName(RC); - // std::string rcStr = getNVPTXRegClassStr(RC); - // //int n = regmap.size(); - // if (!isNVPTXVectorRegClass(RC)) { - // O << "\t.reg " << rcname << " \t" << rcStr << "<" - // << NVPTXNumRegisters << ">;\n"; - // } + for (unsigned i=0; i< TRI->getNumRegClasses(); i++) { + const TargetRegisterClass *RC = TRI->getRegClass(i); + DenseMap &regmap = VRegMapping[RC]; + std::string rcname = getNVPTXRegClassName(RC); + std::string rcStr = getNVPTXRegClassStr(RC); + int n = regmap.size(); - // Only declare those registers that may be used. And do not emit vector - // registers as - // they are all elementized to scalar registers.
- //if (n && !isNVPTXVectorRegClass(RC)) { - // if (RegAllocNilUsed) { - // O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1) - // << ">;\n"; - // } - // else { - // O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr) - // << "<" << 32 << ">;\n"; - // } - //} - //} + // Only declare those registers that may be used. + if (n) { + O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1) + << ">;\n"; + } + } OutStreamer.EmitRawText(O.str()); } diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 7faa6b265b9..55f29436672 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -243,7 +243,9 @@ private: // The contents are specific for each // MachineFunction. But the size of the // array is not. - std::map *VRidGlobal2LocalMap; + typedef DenseMap VRegMap; + typedef DenseMap VRegRCMap; + VRegRCMap VRegMapping; // cache the subtarget here. const NVPTXSubtarget &nvptxSubtarget; // Build the map between type name and ID based on module's type @@ -281,7 +283,6 @@ public: : AsmPrinter(TM, Streamer), nvptxSubtarget(TM.getSubtarget()) { CurrentBankselLabelInBasicBlock = ""; - VRidGlobal2LocalMap = NULL; reader = NULL; } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 33a63c26f4e..52be28736b0 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -32,36 +32,36 @@ NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm) void NVPTXInstrInfo::copyPhysReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { - if (NVPTX::Int32RegsRegClass.contains(DestReg) && - NVPTX::Int32RegsRegClass.contains(SrcReg)) + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + + if (DestRC != SrcRC) + 
report_fatal_error("Attempted to created cross-class register copy"); + + if (DestRC == &NVPTX::Int32RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Int8RegsRegClass.contains(DestReg) && - NVPTX::Int8RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Int1RegsRegClass.contains(DestReg) && - NVPTX::Int1RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Int1RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Float32RegsRegClass.contains(DestReg) && - NVPTX::Float32RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Float32RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Int16RegsRegClass.contains(DestReg) && - NVPTX::Int16RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Int16RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Int64RegsRegClass.contains(DestReg) && - NVPTX::Int64RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Int8RegsRegClass) + BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Int64RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::Float64RegsRegClass.contains(DestReg) && - NVPTX::Float64RegsRegClass.contains(SrcReg)) + .addReg(SrcReg, getKillRegState(KillSrc)); + else if (DestRC == &NVPTX::Float64RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) - .addReg(SrcReg, 
getKillRegState(KillSrc)); + .addReg(SrcReg, getKillRegState(KillSrc)); else { - llvm_unreachable("Don't know how to copy a register"); + llvm_unreachable("Bad register copy"); } } diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp new file mode 100644 index 00000000000..843ebed5e4a --- /dev/null +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -0,0 +1,225 @@ +//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified +// to remove unneeded functionality and to handle virtual registers. Most code +// here is a copy of PrologEpilogInserter.cpp. +// +//===----------------------------------------------------------------------===// + +#include "NVPTX.h" +#include "llvm/Pass.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class NVPTXPrologEpilogPass : public MachineFunctionPass { +public: + static char ID; + NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + +private: + void calculateFrameObjectOffsets(MachineFunction &Fn); +}; +} + +MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() { + return new NVPTXPrologEpilogPass(); +} + +char NVPTXPrologEpilogPass::ID = 0; + +bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) { + const TargetMachine &TM = MF.getTarget(); + const TargetFrameLowering &TFI = 
*TM.getFrameLowering(); + const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); + bool Modified = false; + + calculateFrameObjectOffsets(MF); + + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) { + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + MachineInstr *MI = I; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (!MI->getOperand(i).isFI()) + continue; + TRI.eliminateFrameIndex(MI, 0, i, NULL); + Modified = true; + } + } + } + + // Add function prolog/epilog + TFI.emitPrologue(MF); + + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + // If last instruction is a return instruction, add an epilogue + if (!I->empty() && I->back().isReturn()) + TFI.emitEpilogue(MF, *I); + } + + return Modified; +} + +/// AdjustStackOffset - Helper function used to adjust the stack frame offset. +static inline void +AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, + bool StackGrowsDown, int64_t &Offset, + unsigned &MaxAlign) { + // If the stack grows down, add the object size to find the lowest address. + if (StackGrowsDown) + Offset += MFI->getObjectSize(FrameIdx); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + + // If the alignment of this object is greater than that of the stack, then + // increase the stack alignment to match. + MaxAlign = std::max(MaxAlign, Align); + + // Adjust to alignment boundary. 
+ Offset = (Offset + Align - 1) / Align * Align; + + if (StackGrowsDown) { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset + } else { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, Offset); + Offset += MFI->getObjectSize(FrameIdx); + } +} + +void +NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering(); + const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo(); + + bool StackGrowsDown = + TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; + + // Loop over all of the stack objects, assigning sequential addresses... + MachineFrameInfo *MFI = Fn.getFrameInfo(); + + // Start at the beginning of the local area. + // The Offset is the distance from the stack top in the direction + // of stack growth -- so it's always nonnegative. + int LocalAreaOffset = TFI.getOffsetOfLocalArea(); + if (StackGrowsDown) + LocalAreaOffset = -LocalAreaOffset; + assert(LocalAreaOffset >= 0 + && "Local area offset should be in direction of stack growth"); + int64_t Offset = LocalAreaOffset; + + // If there are fixed sized objects that are preallocated in the local area, + // non-fixed objects can't be allocated right at the start of local area. + // We currently don't support filling in holes in between fixed sized + // objects, so we adjust 'Offset' to point to the end of last fixed sized + // preallocated object. + for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) { + int64_t FixedOff; + if (StackGrowsDown) { + // The maximum distance from the stack pointer is at lower address of + // the object -- which is given by offset. For down growing stack + // the offset is negative, so we negate the offset to get the distance. 
+ FixedOff = -MFI->getObjectOffset(i); + } else { + // The maximum distance from the start pointer is at the upper + // address of the object. + FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i); + } + if (FixedOff > Offset) Offset = FixedOff; + } + + // NOTE: We do not have a call stack + + unsigned MaxAlign = MFI->getMaxAlignment(); + + // No scavenger + + // FIXME: Once this is working, then enable flag will change to a target + // check for whether the frame is large enough to want to use virtual + // frame index registers. Functions which don't want/need this optimization + // will continue to use the existing code path. + if (MFI->getUseLocalStackAllocationBlock()) { + unsigned Align = MFI->getLocalFrameMaxAlign(); + + // Adjust to alignment boundary. + Offset = (Offset + Align - 1) / Align * Align; + + DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); + + // Resolve offsets for objects in the local block. + for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) { + std::pair Entry = MFI->getLocalFrameObjectMap(i); + int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second; + DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << + FIOffset << "]\n"); + MFI->setObjectOffset(Entry.first, FIOffset); + } + // Allocate the local block + Offset += MFI->getLocalFrameSize(); + + MaxAlign = std::max(Align, MaxAlign); + } + + // No stack protector + + // Then assign frame offsets to stack objects that are not used to spill + // callee saved registers. 
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); + } + + // No scavenger + + if (!TFI.targetHandlesStackFrameRounding()) { + // If we have reserved argument space for call sites in the function + // immediately on entry to the current function, count it as part of the + // overall stack size. + if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn)) + Offset += MFI->getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. + unsigned StackAlign; + if (MFI->adjustsStack() || MFI->hasVarSizedObjects() || + (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0)) + StackAlign = TFI.getStackAlignment(); + else + StackAlign = TFI.getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. + StackAlign = std::max(StackAlign, MaxAlign); + unsigned AlignMask = StackAlign - 1; + Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); + } + + // Update frame info to pretend that this is part of the stack... 
+ int64_t StackSize = Offset - LocalAreaOffset; + MFI->setStackSize(StackSize); +} diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 282465359b0..bb039f83979 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -57,9 +57,9 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) { return "%f"; } if (RC == &NVPTX::Float64RegsRegClass) { - return "%fd"; + return "%fl"; } else if (RC == &NVPTX::Int64RegsRegClass) { - return "%rd"; + return "%rl"; } else if (RC == &NVPTX::Int32RegsRegClass) { return "%r"; } else if (RC == &NVPTX::Int16RegsRegClass) { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 5b8ea1ed998..68f9bf74f7f 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -107,6 +107,10 @@ public: virtual void addIRPasses(); virtual bool addInstSelector(); virtual bool addPreRegAlloc(); + virtual bool addPostRegAlloc(); + + virtual void addFastRegAlloc(FunctionPass *RegAllocPass); + virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass); }; } // end anonymous namespace @@ -116,6 +120,15 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { } void NVPTXPassConfig::addIRPasses() { + // The following passes are known to not play well with virtual regs hanging + // around after register allocation (which in our case, is *all* registers). + // We explicitly disable them here. We do, however, need some functionality + // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the + // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). 
+ disablePass(&PrologEpilogCodeInserterID); + disablePass(&MachineCopyPropagationID); + disablePass(&BranchFolderPassID); + TargetPassConfig::addIRPasses(); addPass(createGenericToNVVMPass()); } @@ -129,3 +142,17 @@ bool NVPTXPassConfig::addInstSelector() { } bool NVPTXPassConfig::addPreRegAlloc() { return false; } +bool NVPTXPassConfig::addPostRegAlloc() { + addPass(createNVPTXPrologEpilogPass()); + return false; +} + +void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + // No reg alloc + addPass(&StrongPHIEliminationID); +} + +void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + // No reg alloc + addPass(&StrongPHIEliminationID); +} diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll index 53a28f33379..af91bb44241 100644 --- a/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/test/CodeGen/NVPTX/intrinsic-old.ll @@ -2,231 +2,231 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s define ptx_device i32 @test_tid_x() { -; CHECK: mov.u32 %r0, %tid.x; +; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x; ; CHECK: ret; %x = call i32 @llvm.ptx.read.tid.x() ret i32 %x } define ptx_device i32 @test_tid_y() { -; CHECK: mov.u32 %r0, %tid.y; +; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y; ; CHECK: ret; %x = call i32 @llvm.ptx.read.tid.y() ret i32 %x } define ptx_device i32 @test_tid_z() { -; CHECK: mov.u32 %r0, %tid.z; +; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z; ; CHECK: ret; %x = call i32 @llvm.ptx.read.tid.z() ret i32 %x } define ptx_device i32 @test_tid_w() { -; CHECK: mov.u32 %r0, %tid.w; +; CHECK: mov.u32 %r{{[0-9]+}}, %tid.w; ; CHECK: ret; %x = call i32 @llvm.ptx.read.tid.w() ret i32 %x } define ptx_device i32 @test_ntid_x() { -; CHECK: mov.u32 %r0, %ntid.x; +; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ntid.x() ret i32 %x } define ptx_device i32 @test_ntid_y() { -; CHECK: mov.u32 %r0, %ntid.y; +; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y; ; CHECK: ret; %x = call i32 
@llvm.ptx.read.ntid.y() ret i32 %x } define ptx_device i32 @test_ntid_z() { -; CHECK: mov.u32 %r0, %ntid.z; +; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ntid.z() ret i32 %x } define ptx_device i32 @test_ntid_w() { -; CHECK: mov.u32 %r0, %ntid.w; +; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.w; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ntid.w() ret i32 %x } define ptx_device i32 @test_laneid() { -; CHECK: mov.u32 %r0, %laneid; +; CHECK: mov.u32 %r{{[0-9]+}}, %laneid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.laneid() ret i32 %x } define ptx_device i32 @test_warpid() { -; CHECK: mov.u32 %r0, %warpid; +; CHECK: mov.u32 %r{{[0-9]+}}, %warpid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.warpid() ret i32 %x } define ptx_device i32 @test_nwarpid() { -; CHECK: mov.u32 %r0, %nwarpid; +; CHECK: mov.u32 %r{{[0-9]+}}, %nwarpid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nwarpid() ret i32 %x } define ptx_device i32 @test_ctaid_x() { -; CHECK: mov.u32 %r0, %ctaid.x; +; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ctaid.x() ret i32 %x } define ptx_device i32 @test_ctaid_y() { -; CHECK: mov.u32 %r0, %ctaid.y; +; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ctaid.y() ret i32 %x } define ptx_device i32 @test_ctaid_z() { -; CHECK: mov.u32 %r0, %ctaid.z; +; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ctaid.z() ret i32 %x } define ptx_device i32 @test_ctaid_w() { -; CHECK: mov.u32 %r0, %ctaid.w; +; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.w; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ctaid.w() ret i32 %x } define ptx_device i32 @test_nctaid_x() { -; CHECK: mov.u32 %r0, %nctaid.x; +; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nctaid.x() ret i32 %x } define ptx_device i32 @test_nctaid_y() { -; CHECK: mov.u32 %r0, %nctaid.y; +; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y; ; CHECK: ret; %x = call i32 
@llvm.ptx.read.nctaid.y() ret i32 %x } define ptx_device i32 @test_nctaid_z() { -; CHECK: mov.u32 %r0, %nctaid.z; +; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nctaid.z() ret i32 %x } define ptx_device i32 @test_nctaid_w() { -; CHECK: mov.u32 %r0, %nctaid.w; +; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.w; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nctaid.w() ret i32 %x } define ptx_device i32 @test_smid() { -; CHECK: mov.u32 %r0, %smid; +; CHECK: mov.u32 %r{{[0-9]+}}, %smid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.smid() ret i32 %x } define ptx_device i32 @test_nsmid() { -; CHECK: mov.u32 %r0, %nsmid; +; CHECK: mov.u32 %r{{[0-9]+}}, %nsmid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nsmid() ret i32 %x } define ptx_device i32 @test_gridid() { -; CHECK: mov.u32 %r0, %gridid; +; CHECK: mov.u32 %r{{[0-9]+}}, %gridid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.gridid() ret i32 %x } define ptx_device i32 @test_lanemask_eq() { -; CHECK: mov.u32 %r0, %lanemask_eq; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_eq; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.eq() ret i32 %x } define ptx_device i32 @test_lanemask_le() { -; CHECK: mov.u32 %r0, %lanemask_le; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_le; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.le() ret i32 %x } define ptx_device i32 @test_lanemask_lt() { -; CHECK: mov.u32 %r0, %lanemask_lt; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_lt; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.lt() ret i32 %x } define ptx_device i32 @test_lanemask_ge() { -; CHECK: mov.u32 %r0, %lanemask_ge; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_ge; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.ge() ret i32 %x } define ptx_device i32 @test_lanemask_gt() { -; CHECK: mov.u32 %r0, %lanemask_gt; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_gt; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.gt() ret i32 %x } define ptx_device i32 @test_clock() { -; CHECK: mov.u32 %r0, %clock; +; CHECK: mov.u32 
%r{{[0-9]+}}, %clock; ; CHECK: ret; %x = call i32 @llvm.ptx.read.clock() ret i32 %x } define ptx_device i64 @test_clock64() { -; CHECK: mov.u64 %rl0, %clock64; +; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64; ; CHECK: ret; %x = call i64 @llvm.ptx.read.clock64() ret i64 %x } define ptx_device i32 @test_pm0() { -; CHECK: mov.u32 %r0, %pm0; +; CHECK: mov.u32 %r{{[0-9]+}}, %pm0; ; CHECK: ret; %x = call i32 @llvm.ptx.read.pm0() ret i32 %x } define ptx_device i32 @test_pm1() { -; CHECK: mov.u32 %r0, %pm1; +; CHECK: mov.u32 %r{{[0-9]+}}, %pm1; ; CHECK: ret; %x = call i32 @llvm.ptx.read.pm1() ret i32 %x } define ptx_device i32 @test_pm2() { -; CHECK: mov.u32 %r0, %pm2; +; CHECK: mov.u32 %r{{[0-9]+}}, %pm2; ; CHECK: ret; %x = call i32 @llvm.ptx.read.pm2() ret i32 %x } define ptx_device i32 @test_pm3() { -; CHECK: mov.u32 %r0, %pm3; +; CHECK: mov.u32 %r{{[0-9]+}}, %pm3; ; CHECK: ret; %x = call i32 @llvm.ptx.read.pm3() ret i32 %x diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll index 1676f20643d..78e1e778901 100644 --- a/test/CodeGen/NVPTX/intrinsics.ll +++ b/test/CodeGen/NVPTX/intrinsics.ll @@ -2,14 +2,14 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s define ptx_device float @test_fabsf(float %f) { -; CHECK: abs.f32 %f0, %f0; +; CHECK: abs.f32 %f{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %x = call float @llvm.fabs.f32(float %f) ret float %x } define ptx_device double @test_fabs(double %d) { -; CHECK: abs.f64 %fl0, %fl0; +; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}; ; CHECK: ret; %x = call double @llvm.fabs.f64(double %d) ret double %x