llvm-6502/lib/Target/PowerPC/PPCFastISel.cpp

2352 lines
81 KiB
C++
Raw Normal View History

//===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the PowerPC-specific support for the FastISel class. Some
// of the target-specific code is generated by tablegen in the file
// PPCGenFastISel.inc, which is #included here.
//
//===----------------------------------------------------------------------===//
#include "PPC.h"
#include "MCTargetDesc/PPCPredicates.h"
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
#include "PPCCallingConv.h"
#include "PPCISelLowering.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
//===----------------------------------------------------------------------===//
//
// TBD:
// fastLowerArguments: Handle simple cases.
// PPCMaterializeGV: Handle TLS.
// SelectCall: Handle function pointers.
// SelectCall: Handle multi-register return values.
// SelectCall: Optimize away nops for local calls.
// processCallArgs: Handle bit-converted arguments.
// finishCall: Handle multi-register return values.
// PPCComputeAddress: Handle parameter references as FrameIndex's.
// PPCEmitCmp: Handle immediate as operand 1.
// SelectCall: Handle small byval arguments.
// SelectIntrinsicCall: Implement.
// SelectSelect: Implement.
// Consider factoring isTypeLegal into the base class.
// Implement switches and jump tables.
//
//===----------------------------------------------------------------------===//
using namespace llvm;
#define DEBUG_TYPE "ppcfastisel"
namespace {
typedef struct Address {
enum {
RegBase,
FrameIndexBase
} BaseType;
union {
unsigned Reg;
int FI;
} Base;
long Offset;
// Innocuous defaults for our address.
Address()
: BaseType(RegBase), Offset(0) {
Base.Reg = 0;
}
} Address;
class PPCFastISel final : public FastISel {
const TargetMachine &TM;
const PPCSubtarget *PPCSubTarget;
PPCFunctionInfo *PPCFuncInfo;
const TargetInstrInfo &TII;
const TargetLowering &TLI;
LLVMContext *Context;
public:
explicit PPCFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo)
: FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()),
PPCSubTarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()),
PPCFuncInfo(FuncInfo.MF->getInfo<PPCFunctionInfo>()),
TII(*PPCSubTarget->getInstrInfo()),
TLI(*PPCSubTarget->getTargetLowering()),
Context(&FuncInfo.Fn->getContext()) {}
// Backend specific FastISel code.
private:
bool fastSelectInstruction(const Instruction *I) override;
unsigned fastMaterializeConstant(const Constant *C) override;
unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) override;
bool fastLowerArguments() override;
unsigned fastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override;
unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
uint64_t Imm);
unsigned fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill);
unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
unsigned Op1, bool Op1IsKill);
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
bool fastLowerCall(CallLoweringInfo &CLI) override;
// Instruction selection routines.
private:
bool SelectLoad(const Instruction *I);
bool SelectStore(const Instruction *I);
bool SelectBranch(const Instruction *I);
bool SelectIndirectBr(const Instruction *I);
bool SelectFPExt(const Instruction *I);
bool SelectFPTrunc(const Instruction *I);
bool SelectIToFP(const Instruction *I, bool IsSigned);
bool SelectFPToI(const Instruction *I, bool IsSigned);
bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
bool SelectRet(const Instruction *I);
bool SelectTrunc(const Instruction *I);
bool SelectIntExt(const Instruction *I);
// Utility routines.
private:
bool isTypeLegal(Type *Ty, MVT &VT);
bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool isValueAvailable(const Value *V) const;
bool isVSFRCRegister(unsigned Register) const {
return MRI.getRegClass(Register)->getID() == PPC::VSFRCRegClassID;
}
bool isVSSRCRegister(unsigned Register) const {
return MRI.getRegClass(Register)->getID() == PPC::VSSRCRegClassID;
}
bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
bool isZExt, unsigned DestReg);
bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
const TargetRegisterClass *RC, bool IsZExt = true,
unsigned FP64LoadOpc = PPC::LFD);
bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
bool PPCComputeAddress(const Value *Obj, Address &Addr);
void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
unsigned &IndexReg);
bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
unsigned DestReg, bool IsZExt);
unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
unsigned PPCMaterializeInt(const Constant *C, MVT VT, bool UseSExt = true);
unsigned PPCMaterialize32BitInt(int64_t Imm,
const TargetRegisterClass *RC);
unsigned PPCMaterialize64BitInt(int64_t Imm,
const TargetRegisterClass *RC);
unsigned PPCMoveToIntReg(const Instruction *I, MVT VT,
unsigned SrcReg, bool IsSigned);
unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned);
// Call handling routines.
private:
bool processCallArgs(SmallVectorImpl<Value*> &Args,
SmallVectorImpl<unsigned> &ArgRegs,
SmallVectorImpl<MVT> &ArgVTs,
SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
SmallVectorImpl<unsigned> &RegArgs,
CallingConv::ID CC,
unsigned &NumBytes,
bool IsVarArg);
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
bool finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes);
CCAssignFn *usePPC32CCs(unsigned Flag);
private:
#include "PPCGenFastISel.inc"
};
} // end anonymous namespace
#include "PPCGenCallingConv.inc"
// Function whose sole purpose is to kill compiler warnings
// stemming from unused functions included from PPCGenCallingConv.inc.
CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
if (Flag == 1)
return CC_PPC32_SVR4;
else if (Flag == 2)
return CC_PPC32_SVR4_ByVal;
else if (Flag == 3)
return CC_PPC32_SVR4_VarArg;
else
return RetCC_PPC;
}
static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
switch (Pred) {
// These are not representable with any single compare.
case CmpInst::FCMP_FALSE:
case CmpInst::FCMP_UEQ:
case CmpInst::FCMP_UGT:
case CmpInst::FCMP_UGE:
case CmpInst::FCMP_ULT:
case CmpInst::FCMP_ULE:
case CmpInst::FCMP_UNE:
case CmpInst::FCMP_TRUE:
default:
return Optional<PPC::Predicate>();
case CmpInst::FCMP_OEQ:
case CmpInst::ICMP_EQ:
return PPC::PRED_EQ;
case CmpInst::FCMP_OGT:
case CmpInst::ICMP_UGT:
case CmpInst::ICMP_SGT:
return PPC::PRED_GT;
case CmpInst::FCMP_OGE:
case CmpInst::ICMP_UGE:
case CmpInst::ICMP_SGE:
return PPC::PRED_GE;
case CmpInst::FCMP_OLT:
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_SLT:
return PPC::PRED_LT;
case CmpInst::FCMP_OLE:
case CmpInst::ICMP_ULE:
case CmpInst::ICMP_SLE:
return PPC::PRED_LE;
case CmpInst::FCMP_ONE:
case CmpInst::ICMP_NE:
return PPC::PRED_NE;
case CmpInst::FCMP_ORD:
return PPC::PRED_NU;
case CmpInst::FCMP_UNO:
return PPC::PRED_UN;
}
}
// Determine whether the type Ty is simple enough to be handled by
// fast-isel, and return its equivalent machine type in VT.
// FIXME: Copied directly from ARM -- factor into base class?
bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
EVT Evt = TLI.getValueType(DL, Ty, true);
// Only handle simple types.
if (Evt == MVT::Other || !Evt.isSimple()) return false;
VT = Evt.getSimpleVT();
// Handle all legal types, i.e. a register that will directly hold this
// value.
return TLI.isTypeLegal(VT);
}
// Determine whether the type Ty is simple enough to be handled by
// fast-isel as a load target, and return its equivalent machine type in VT.
bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
if (isTypeLegal(Ty, VT)) return true;
// If this is a type than can be sign or zero-extended to a basic operation
// go ahead and accept it now.
if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
return true;
}
return false;
}
bool PPCFastISel::isValueAvailable(const Value *V) const {
if (!isa<Instruction>(V))
return true;
const auto *I = cast<Instruction>(V);
if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
return true;
return false;
}
// Given a value Obj, create an Address object Addr that represents its
// address. Return false if we can't handle it.
bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
// Don't walk into other basic blocks unless the object is an alloca from
// another block, otherwise it may not have a virtual register assigned.
if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
Opcode = I->getOpcode();
U = I;
}
} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
Opcode = C->getOpcode();
U = C;
}
switch (Opcode) {
default:
break;
case Instruction::BitCast:
// Look through bitcasts.
return PPCComputeAddress(U->getOperand(0), Addr);
case Instruction::IntToPtr:
// Look past no-op inttoptrs.
if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
TLI.getPointerTy(DL))
return PPCComputeAddress(U->getOperand(0), Addr);
break;
case Instruction::PtrToInt:
// Look past no-op ptrtoints.
if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return PPCComputeAddress(U->getOperand(0), Addr);
break;
case Instruction::GetElementPtr: {
Address SavedAddr = Addr;
long TmpOffset = Addr.Offset;
// Iterate through the GEP folding the constants into offsets where
// we can.
gep_type_iterator GTI = gep_type_begin(U);
for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
II != IE; ++II, ++GTI) {
const Value *Op = *II;
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
const StructLayout *SL = DL.getStructLayout(STy);
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
for (;;) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
break;
}
if (canFoldAddIntoGEP(U, Op)) {
// A compatible add with a constant operand. Fold the constant.
ConstantInt *CI =
cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
TmpOffset += CI->getSExtValue() * S;
// Iterate on the other operand.
Op = cast<AddOperator>(Op)->getOperand(0);
continue;
}
// Unsupported
goto unsupported_gep;
}
}
}
// Try to grab the base operand now.
Addr.Offset = TmpOffset;
if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
// We failed, restore everything and try the other options.
Addr = SavedAddr;
unsupported_gep:
break;
}
case Instruction::Alloca: {
const AllocaInst *AI = cast<AllocaInst>(Obj);
DenseMap<const AllocaInst*, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
Addr.BaseType = Address::FrameIndexBase;
Addr.Base.FI = SI->second;
return true;
}
break;
}
}
// FIXME: References to parameters fall through to the behavior
// below. They should be able to reference a frame index since
// they are stored to the stack, so we can get "ld rx, offset(r1)"
// instead of "addi ry, r1, offset / ld rx, 0(ry)". Obj will
// just contain the parameter. Try to handle this with a FI.
// Try to get this in a register if nothing else has worked.
if (Addr.Base.Reg == 0)
Addr.Base.Reg = getRegForValue(Obj);
// Prevent assignment of base register to X0, which is inappropriate
// for loads and stores alike.
if (Addr.Base.Reg != 0)
MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass);
return Addr.Base.Reg != 0;
}
// Fix up some addresses that can't be used directly. For example, if
// an offset won't fit in an instruction field, we may need to move it
// into an index register.
void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
unsigned &IndexReg) {
// Check whether the offset fits in the instruction field.
if (!isInt<16>(Addr.Offset))
UseOffset = false;
// If this is a stack pointer and the offset needs to be simplified then
// put the alloca address into a register, set the base type back to
// register and continue. This should almost never happen.
if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
Addr.Base.Reg = ResultReg;
Addr.BaseType = Address::RegBase;
}
if (!UseOffset) {
IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context)
: Type::getInt64Ty(*Context));
const ConstantInt *Offset =
ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset));
IndexReg = PPCMaterializeInt(Offset, MVT::i64);
assert(IndexReg && "Unexpected error in PPCMaterializeInt!");
}
}
// Emit a load instruction if possible, returning true if we succeeded,
// otherwise false. See commentary below for how the register class of
// the load is determined.
bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
const TargetRegisterClass *RC,
bool IsZExt, unsigned FP64LoadOpc) {
unsigned Opc;
bool UseOffset = true;
// If ResultReg is given, it determines the register class of the load.
// Otherwise, RC is the register class to use. If the result of the
// load isn't anticipated in this block, both may be zero, in which
// case we must make a conservative guess. In particular, don't assign
// R0 or X0 to the result register, as the result may be used in a load,
// store, add-immediate, or isel that won't permit this. (Though
// perhaps the spill and reload of live-exit values would handle this?)
const TargetRegisterClass *UseRC =
(ResultReg ? MRI.getRegClass(ResultReg) :
(RC ? RC :
(VT == MVT::f64 ? &PPC::F8RCRegClass :
(VT == MVT::f32 ? &PPC::F4RCRegClass :
(VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
&PPC::GPRC_and_GPRC_NOR0RegClass)))));
bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass);
switch (VT.SimpleTy) {
default: // e.g., vector types not handled
return false;
case MVT::i8:
Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8;
break;
case MVT::i16:
Opc = (IsZExt ?
(Is32BitInt ? PPC::LHZ : PPC::LHZ8) :
(Is32BitInt ? PPC::LHA : PPC::LHA8));
break;
case MVT::i32:
Opc = (IsZExt ?
(Is32BitInt ? PPC::LWZ : PPC::LWZ8) :
(Is32BitInt ? PPC::LWA_32 : PPC::LWA));
if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
UseOffset = false;
break;
case MVT::i64:
Opc = PPC::LD;
assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) &&
"64-bit load with 32-bit target??");
UseOffset = ((Addr.Offset & 3) == 0);
break;
case MVT::f32:
Opc = PPC::LFS;
break;
case MVT::f64:
Opc = FP64LoadOpc;
break;
}
// If necessary, materialize the offset into a register and use
// the indexed form. Also handle stack pointers with special needs.
unsigned IndexReg = 0;
PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
// If this is a potential VSX load with an offset of 0, a VSX indexed load can
// be used.
bool IsVSSRC = (ResultReg != 0) && isVSSRCRegister(ResultReg);
bool IsVSFRC = (ResultReg != 0) && isVSFRCRegister(ResultReg);
bool Is32VSXLoad = IsVSSRC && Opc == PPC::LFS;
bool Is64VSXLoad = IsVSSRC && Opc == PPC::LFD;
if ((Is32VSXLoad || Is64VSXLoad) &&
(Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
(Addr.Offset == 0)) {
UseOffset = false;
}
if (ResultReg == 0)
ResultReg = createResultReg(UseRC);
// Note: If we still have a frame index here, we know the offset is
// in range, as otherwise PPCSimplifyAddress would have converted it
// into a RegBase.
if (Addr.BaseType == Address::FrameIndexBase) {
// VSX only provides an indexed load.
if (Is32VSXLoad || Is64VSXLoad) return false;
MachineMemOperand *MMO =
FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
MFI.getObjectAlignment(Addr.Base.FI));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
// Base reg with offset in range.
} else if (UseOffset) {
// VSX only provides an indexed load.
if (Is32VSXLoad || Is64VSXLoad) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addImm(Addr.Offset).addReg(Addr.Base.Reg);
// Indexed form.
} else {
// Get the RR opcode corresponding to the RI one. FIXME: It would be
// preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
// is hard to get at.
switch (Opc) {
default: llvm_unreachable("Unexpected opcode!");
case PPC::LBZ: Opc = PPC::LBZX; break;
case PPC::LBZ8: Opc = PPC::LBZX8; break;
case PPC::LHZ: Opc = PPC::LHZX; break;
case PPC::LHZ8: Opc = PPC::LHZX8; break;
case PPC::LHA: Opc = PPC::LHAX; break;
case PPC::LHA8: Opc = PPC::LHAX8; break;
case PPC::LWZ: Opc = PPC::LWZX; break;
case PPC::LWZ8: Opc = PPC::LWZX8; break;
case PPC::LWA: Opc = PPC::LWAX; break;
case PPC::LWA_32: Opc = PPC::LWAX_32; break;
case PPC::LD: Opc = PPC::LDX; break;
case PPC::LFS: Opc = IsVSSRC ? PPC::LXSSPX : PPC::LFSX; break;
case PPC::LFD: Opc = IsVSFRC ? PPC::LXSDX : PPC::LFDX; break;
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(Addr.Base.Reg).addReg(IndexReg);
}
return true;
}
// Attempt to fast-select a load instruction.
bool PPCFastISel::SelectLoad(const Instruction *I) {
// FIXME: No atomic loads are supported.
if (cast<LoadInst>(I)->isAtomic())
return false;
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(I->getType(), VT))
return false;
// See if we can handle this address.
Address Addr;
if (!PPCComputeAddress(I->getOperand(0), Addr))
return false;
// Look at the currently assigned register for this instruction
// to determine the required register class. This is necessary
// to constrain RA from using R0/X0 when this is not legal.
unsigned AssignedReg = FuncInfo.ValueMap[I];
const TargetRegisterClass *RC =
AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
unsigned ResultReg = 0;
if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
return false;
updateValueMap(I, ResultReg);
return true;
}
// Emit a store instruction to store SrcReg at Addr.
bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
assert(SrcReg && "Nothing to store!");
unsigned Opc;
bool UseOffset = true;
const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
switch (VT.SimpleTy) {
default: // e.g., vector types not handled
return false;
case MVT::i8:
Opc = Is32BitInt ? PPC::STB : PPC::STB8;
break;
case MVT::i16:
Opc = Is32BitInt ? PPC::STH : PPC::STH8;
break;
case MVT::i32:
assert(Is32BitInt && "Not GPRC for i32??");
Opc = PPC::STW;
break;
case MVT::i64:
Opc = PPC::STD;
UseOffset = ((Addr.Offset & 3) == 0);
break;
case MVT::f32:
Opc = PPC::STFS;
break;
case MVT::f64:
Opc = PPC::STFD;
break;
}
// If necessary, materialize the offset into a register and use
// the indexed form. Also handle stack pointers with special needs.
unsigned IndexReg = 0;
PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
// If this is a potential VSX store with an offset of 0, a VSX indexed store
// can be used.
bool IsVSSRC = isVSSRCRegister(SrcReg);
bool IsVSFRC = isVSFRCRegister(SrcReg);
bool Is32VSXStore = IsVSSRC && Opc == PPC::STFS;
bool Is64VSXStore = IsVSFRC && Opc == PPC::STFD;
if ((Is32VSXStore || Is64VSXStore) &&
(Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
(Addr.Offset == 0)) {
UseOffset = false;
}
// Note: If we still have a frame index here, we know the offset is
// in range, as otherwise PPCSimplifyAddress would have converted it
// into a RegBase.
if (Addr.BaseType == Address::FrameIndexBase) {
// VSX only provides an indexed store.
if (Is32VSXStore || Is64VSXStore) return false;
MachineMemOperand *MMO =
FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
MFI.getObjectAlignment(Addr.Base.FI));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg)
.addImm(Addr.Offset)
.addFrameIndex(Addr.Base.FI)
.addMemOperand(MMO);
// Base reg with offset in range.
} else if (UseOffset) {
// VSX only provides an indexed store.
if (Is32VSXStore || Is64VSXStore) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
// Indexed form.
} else {
// Get the RR opcode corresponding to the RI one. FIXME: It would be
// preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
// is hard to get at.
switch (Opc) {
default: llvm_unreachable("Unexpected opcode!");
case PPC::STB: Opc = PPC::STBX; break;
case PPC::STH : Opc = PPC::STHX; break;
case PPC::STW : Opc = PPC::STWX; break;
case PPC::STB8: Opc = PPC::STBX8; break;
case PPC::STH8: Opc = PPC::STHX8; break;
case PPC::STW8: Opc = PPC::STWX8; break;
case PPC::STD: Opc = PPC::STDX; break;
case PPC::STFS: Opc = IsVSSRC ? PPC::STXSSPX : PPC::STFSX; break;
case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break;
}
auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg);
// If we have an index register defined we use it in the store inst,
// otherwise we use X0 as base as it makes the vector instructions to
// use zero in the computation of the effective address regardless the
// content of the register.
if (IndexReg)
MIB.addReg(Addr.Base.Reg).addReg(IndexReg);
else
MIB.addReg(PPC::ZERO8).addReg(Addr.Base.Reg);
}
return true;
}
// Attempt to fast-select a store instruction.
bool PPCFastISel::SelectStore(const Instruction *I) {
Value *Op0 = I->getOperand(0);
unsigned SrcReg = 0;
// FIXME: No atomics loads are supported.
if (cast<StoreInst>(I)->isAtomic())
return false;
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(Op0->getType(), VT))
return false;
// Get the value to be stored into a register.
SrcReg = getRegForValue(Op0);
if (SrcReg == 0)
return false;
// See if we can handle this address.
Address Addr;
if (!PPCComputeAddress(I->getOperand(1), Addr))
return false;
if (!PPCEmitStore(VT, SrcReg, Addr))
return false;
return true;
}
// Attempt to fast-select a branch instruction.
bool PPCFastISel::SelectBranch(const Instruction *I) {
const BranchInst *BI = cast<BranchInst>(I);
MachineBasicBlock *BrBB = FuncInfo.MBB;
MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
// For now, just try the simplest case where it's fed by a compare.
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (isValueAvailable(CI)) {
Optional<PPC::Predicate> OptPPCPred = getComparePred(CI->getPredicate());
if (!OptPPCPred)
return false;
PPC::Predicate PPCPred = OptPPCPred.getValue();
// Take advantage of fall-through opportunities.
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
std::swap(TBB, FBB);
PPCPred = PPC::InvertPredicate(PPCPred);
}
unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
CondReg))
return false;
BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
.addImm(PPCPred).addReg(CondReg).addMBB(TBB);
fastEmitBranch(FBB, DbgLoc);
FuncInfo.MBB->addSuccessor(TBB);
return true;
}
} else if (const ConstantInt *CI =
dyn_cast<ConstantInt>(BI->getCondition())) {
uint64_t Imm = CI->getZExtValue();
MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
fastEmitBranch(Target, DbgLoc);
return true;
}
// FIXME: ARM looks for a case where the block containing the compare
// has been split from the block containing the branch. If this happens,
// there is a vreg available containing the result of the compare. I'm
// not sure we can do much, as we've lost the predicate information with
// the compare instruction -- we have a 4-bit CR but don't know which bit
// to test here.
return false;
}
// Attempt to emit a compare of the two source values. Signed and unsigned
// comparisons are supported. Return false if we can't handle it.
bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
bool IsZExt, unsigned DestReg) {
Type *Ty = SrcValue1->getType();
EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple())
return false;
MVT SrcVT = SrcEVT.getSimpleVT();
if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits())
Add CR-bit tracking to the PowerPC backend for i1 values This change enables tracking i1 values in the PowerPC backend using the condition register bits. These bits can be treated on PowerPC as separate registers; individual bit operations (and, or, xor, etc.) are supported. Tracking booleans in CR bits has several advantages: - Reduction in register pressure (because we no longer need GPRs to store boolean values). - Logical operations on booleans can be handled more efficiently; we used to have to move all results from comparisons into GPRs, perform promoted logical operations in GPRs, and then move the result back into condition register bits to be used by conditional branches. This can be very inefficient, because the throughput of these CR <-> GPR moves have high latency and low throughput (especially when other associated instructions are accounted for). - On the POWER7 and similar cores, we can increase total throughput by using the CR bits. CR bit operations have a dedicated functional unit. Most of this is more-or-less mechanical: Adjustments were needed in the calling-convention code, support was added for spilling/restoring individual condition-register bits, and conditional branch instruction definitions taking specific CR bits were added (plus patterns and code for generating bit-level operations). This is enabled by default when running at -O2 and higher. For -O0 and -O1, where the ability to debug is more important, this feature is disabled by default. Individual CR bits do not have assigned DWARF register numbers, and storing values in CR bits makes them invisible to the debugger. It is critical, however, that we don't move i1 values that have been promoted to larger values (such as those passed as function arguments) into bit registers only to quickly turn around and move the values back into GPRs (such as happens when values are returned by functions). A pair of target-specific DAG combines are added to remove the trunc/extends in: trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) and: zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) In short, we only want to use CR bits where some of the i1 values come from comparisons or are used by conditional branches or selects. To put it another way, if we can do the entire i1 computation in GPRs, then we probably should (on the POWER7, the GPR-operation throughput is higher, and for all cores, the CR <-> GPR moves are expensive). POWER7 test-suite performance results (from 10 runs in each configuration): SingleSource/Benchmarks/Misc/mandel-2: 35% speedup MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown MultiSource/Applications/lemon/lemon: 8% slowdown git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
return false;
// See if operand 2 is an immediate encodeable in the compare.
// FIXME: Operands are not in canonical order at -O0, so an immediate
// operand in position 1 is a lost opportunity for now. We are
// similar to ARM in this regard.
long Imm = 0;
bool UseImm = false;
// Only 16-bit integer constants can be represented in compares for
// PowerPC. Others will be materialized into a register.
if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) {
if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
SrcVT == MVT::i8 || SrcVT == MVT::i1) {
const APInt &CIVal = ConstInt->getValue();
Imm = (IsZExt) ? (long)CIVal.getZExtValue() : (long)CIVal.getSExtValue();
if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm)))
UseImm = true;
}
}
unsigned CmpOpc;
bool NeedsExt = false;
switch (SrcVT.SimpleTy) {
default: return false;
case MVT::f32:
CmpOpc = PPC::FCMPUS;
break;
case MVT::f64:
CmpOpc = PPC::FCMPUD;
break;
case MVT::i1:
case MVT::i8:
case MVT::i16:
NeedsExt = true;
// Intentional fall-through.
case MVT::i32:
if (!UseImm)
CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
else
CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI;
break;
case MVT::i64:
if (!UseImm)
CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD;
else
CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI;
break;
}
unsigned SrcReg1 = getRegForValue(SrcValue1);
if (SrcReg1 == 0)
return false;
unsigned SrcReg2 = 0;
if (!UseImm) {
SrcReg2 = getRegForValue(SrcValue2);
if (SrcReg2 == 0)
return false;
}
if (NeedsExt) {
unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt))
return false;
SrcReg1 = ExtReg;
if (!UseImm) {
unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt))
return false;
SrcReg2 = ExtReg;
}
}
if (!UseImm)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
.addReg(SrcReg1).addReg(SrcReg2);
else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
.addReg(SrcReg1).addImm(Imm);
return true;
}
// Attempt to fast-select a floating-point extend instruction.
bool PPCFastISel::SelectFPExt(const Instruction *I) {
Value *Src = I->getOperand(0);
EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::f32 || DestVT != MVT::f64)
return false;
unsigned SrcReg = getRegForValue(Src);
if (!SrcReg)
return false;
// No code is generated for a FP extend.
updateValueMap(I, SrcReg);
return true;
}
// Attempt to fast-select a floating-point truncate instruction.
bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
Value *Src = I->getOperand(0);
EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::f64 || DestVT != MVT::f32)
return false;
unsigned SrcReg = getRegForValue(Src);
if (!SrcReg)
return false;
// Round the result to single precision.
unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg)
.addReg(SrcReg);
updateValueMap(I, DestReg);
return true;
}
// Move an i32 or i64 value in a GPR to an f64 value in an FPR.
// FIXME: When direct register moves are implemented (see PowerISA 2.07),
// those should be used instead of moving via a stack slot when the
// subtarget permits.
// FIXME: The code here is sloppy for the 4-byte case. Can use a 4-byte
// stack slot and 4-byte store/load sequence. Or just sext the 4-byte
// case to 8 bytes which produces tighter code but wastes stack space.
unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
bool IsSigned) {
// If necessary, extend 32-bit int to 64-bit.
if (SrcVT == MVT::i32) {
unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned))
return 0;
SrcReg = TmpReg;
}
// Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
Address Addr;
Addr.BaseType = Address::FrameIndexBase;
Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
// Store the value from the GPR.
if (!PPCEmitStore(MVT::i64, SrcReg, Addr))
return 0;
// Load the integer value into an FPR. The kind of load used depends
// on a number of conditions.
unsigned LoadOpc = PPC::LFD;
if (SrcVT == MVT::i32) {
if (!IsSigned) {
LoadOpc = PPC::LFIWZX;
Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
} else if (PPCSubTarget->hasLFIWAX()) {
LoadOpc = PPC::LFIWAX;
Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
}
}
const TargetRegisterClass *RC = &PPC::F8RCRegClass;
unsigned ResultReg = 0;
if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc))
return 0;
return ResultReg;
}
// Attempt to fast-select an integer-to-floating-point conversion.
// FIXME: Once fast-isel has better support for VSX, conversions using
// direct moves should be implemented.
bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
MVT DstVT;
Type *DstTy = I->getType();
if (!isTypeLegal(DstTy, DstVT))
return false;
if (DstVT != MVT::f32 && DstVT != MVT::f64)
return false;
Value *Src = I->getOperand(0);
EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true);
if (!SrcEVT.isSimple())
return false;
MVT SrcVT = SrcEVT.getSimpleVT();
if (SrcVT != MVT::i8 && SrcVT != MVT::i16 &&
SrcVT != MVT::i32 && SrcVT != MVT::i64)
return false;
unsigned SrcReg = getRegForValue(Src);
if (SrcReg == 0)
return false;
// We can only lower an unsigned convert if we have the newer
// floating-point conversion operations.
if (!IsSigned && !PPCSubTarget->hasFPCVT())
return false;
// FIXME: For now we require the newer floating-point conversion operations
// (which are present only on P7 and A2 server models) when converting
// to single-precision float. Otherwise we have to generate a lot of
// fiddly code to avoid double rounding. If necessary, the fiddly code
// can be found in PPCTargetLowering::LowerINT_TO_FP().
if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT())
return false;
// Extend the input if necessary.
if (SrcVT == MVT::i8 || SrcVT == MVT::i16) {
unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned))
return false;
SrcVT = MVT::i64;
SrcReg = TmpReg;
}
// Move the integer value to an FPR.
unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned);
if (FPReg == 0)
return false;
// Determine the opcode for the conversion.
const TargetRegisterClass *RC = &PPC::F8RCRegClass;
unsigned DestReg = createResultReg(RC);
unsigned Opc;
if (DstVT == MVT::f32)
Opc = IsSigned ? PPC::FCFIDS : PPC::FCFIDUS;
else
Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU;
// Generate the convert.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(FPReg);
updateValueMap(I, DestReg);
return true;
}
// Move the floating-point value in SrcReg into an integer destination
// register, and return the register (or zero if we can't handle it).
// FIXME: When direct register moves are implemented (see PowerISA 2.07),
// those should be used instead of moving via a stack slot when the
// subtarget permits.
unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
unsigned SrcReg, bool IsSigned) {
// Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
// Note that if have STFIWX available, we could use a 4-byte stack
// slot for i32, but this being fast-isel we'll just go with the
// easiest code gen possible.
Address Addr;
Addr.BaseType = Address::FrameIndexBase;
Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
// Store the value from the FPR.
if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
return 0;
// Reload it into a GPR. If we want an i32, modify the address
// to have a 4-byte offset so we load from the right place.
if (VT == MVT::i32)
Addr.Offset = 4;
// Look at the currently assigned register for this instruction
// to determine the required register class.
unsigned AssignedReg = FuncInfo.ValueMap[I];
const TargetRegisterClass *RC =
AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
unsigned ResultReg = 0;
if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
return 0;
return ResultReg;
}
// Attempt to fast-select a floating-point-to-integer conversion.
// FIXME: Once fast-isel has better support for VSX, conversions using
// direct moves should be implemented.
bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
MVT DstVT, SrcVT;
Type *DstTy = I->getType();
if (!isTypeLegal(DstTy, DstVT))
return false;
if (DstVT != MVT::i32 && DstVT != MVT::i64)
return false;
// If we don't have FCTIDUZ and we need it, punt to SelectionDAG.
if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT())
return false;
Value *Src = I->getOperand(0);
Type *SrcTy = Src->getType();
if (!isTypeLegal(SrcTy, SrcVT))
return false;
if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
return false;
unsigned SrcReg = getRegForValue(Src);
if (SrcReg == 0)
return false;
// Convert f32 to f64 if necessary. This is just a meaningless copy
// to get the register class right. COPY_TO_REGCLASS is needed since
// a COPY from F4RC to F8RC is converted to a F4RC-F4RC copy downstream.
const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
if (InRC == &PPC::F4RCRegClass) {
unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg)
.addReg(SrcReg).addImm(PPC::F8RCRegClassID);
SrcReg = TmpReg;
}
// Determine the opcode for the conversion, which takes place
// entirely within FPRs.
unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
unsigned Opc;
if (DstVT == MVT::i32)
if (IsSigned)
Opc = PPC::FCTIWZ;
else
Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
else
Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
// Generate the convert.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(SrcReg);
// Now move the integer value from a float register to an integer register.
unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
if (IntReg == 0)
return false;
updateValueMap(I, IntReg);
return true;
}
// Attempt to fast-select a binary integer operation that isn't already
// handled automatically.
bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
EVT DestVT = TLI.getValueType(DL, I->getType(), true);
// We can get here in the case when we have a binary operation on a non-legal
// type and the target independent selector doesn't know how to handle it.
if (DestVT != MVT::i16 && DestVT != MVT::i8)
return false;
// Look at the currently assigned register for this instruction
// to determine the required register class. If there is no register,
// make a conservative choice (don't assign R0).
unsigned AssignedReg = FuncInfo.ValueMap[I];
const TargetRegisterClass *RC =
(AssignedReg ? MRI.getRegClass(AssignedReg) :
&PPC::GPRC_and_GPRC_NOR0RegClass);
bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
unsigned Opc;
switch (ISDOpcode) {
default: return false;
case ISD::ADD:
Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8;
break;
case ISD::OR:
Opc = IsGPRC ? PPC::OR : PPC::OR8;
break;
case ISD::SUB:
Opc = IsGPRC ? PPC::SUBF : PPC::SUBF8;
break;
}
unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
unsigned SrcReg1 = getRegForValue(I->getOperand(0));
if (SrcReg1 == 0) return false;
// Handle case of small immediate operand.
if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) {
const APInt &CIVal = ConstInt->getValue();
int Imm = (int)CIVal.getSExtValue();
bool UseImm = true;
if (isInt<16>(Imm)) {
switch (Opc) {
default:
llvm_unreachable("Missing case!");
case PPC::ADD4:
Opc = PPC::ADDI;
MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
break;
case PPC::ADD8:
Opc = PPC::ADDI8;
MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
break;
case PPC::OR:
Opc = PPC::ORI;
break;
case PPC::OR8:
Opc = PPC::ORI8;
break;
case PPC::SUBF:
if (Imm == -32768)
UseImm = false;
else {
Opc = PPC::ADDI;
MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
Imm = -Imm;
}
break;
case PPC::SUBF8:
if (Imm == -32768)
UseImm = false;
else {
Opc = PPC::ADDI8;
MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
Imm = -Imm;
}
break;
}
if (UseImm) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
ResultReg)
.addReg(SrcReg1)
.addImm(Imm);
updateValueMap(I, ResultReg);
return true;
}
}
}
// Reg-reg case.
unsigned SrcReg2 = getRegForValue(I->getOperand(1));
if (SrcReg2 == 0) return false;
// Reverse operands for subtract-from.
if (ISDOpcode == ISD::SUB)
std::swap(SrcReg1, SrcReg2);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(SrcReg1).addReg(SrcReg2);
updateValueMap(I, ResultReg);
return true;
}
// Handle arguments to a call that we're attempting to fast-select.
// Return false if the arguments are too complex for us at the moment.
bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
SmallVectorImpl<unsigned> &ArgRegs,
SmallVectorImpl<MVT> &ArgVTs,
SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
SmallVectorImpl<unsigned> &RegArgs,
CallingConv::ID CC,
unsigned &NumBytes,
bool IsVarArg) {
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, *Context);
2014-06-23 13:47:52 +00:00
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = PPCSubTarget->getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, 8);
2014-06-23 13:47:52 +00:00
CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
// Bail out if we can't handle any of the arguments.
for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
CCValAssign &VA = ArgLocs[I];
MVT ArgVT = ArgVTs[VA.getValNo()];
// Skip vector arguments for now, as well as long double and
// uint128_t, and anything that isn't passed in a register.
Add CR-bit tracking to the PowerPC backend for i1 values This change enables tracking i1 values in the PowerPC backend using the condition register bits. These bits can be treated on PowerPC as separate registers; individual bit operations (and, or, xor, etc.) are supported. Tracking booleans in CR bits has several advantages: - Reduction in register pressure (because we no longer need GPRs to store boolean values). - Logical operations on booleans can be handled more efficiently; we used to have to move all results from comparisons into GPRs, perform promoted logical operations in GPRs, and then move the result back into condition register bits to be used by conditional branches. This can be very inefficient, because the throughput of these CR <-> GPR moves have high latency and low throughput (especially when other associated instructions are accounted for). - On the POWER7 and similar cores, we can increase total throughput by using the CR bits. CR bit operations have a dedicated functional unit. Most of this is more-or-less mechanical: Adjustments were needed in the calling-convention code, support was added for spilling/restoring individual condition-register bits, and conditional branch instruction definitions taking specific CR bits were added (plus patterns and code for generating bit-level operations). This is enabled by default when running at -O2 and higher. For -O0 and -O1, where the ability to debug is more important, this feature is disabled by default. Individual CR bits do not have assigned DWARF register numbers, and storing values in CR bits makes them invisible to the debugger. It is critical, however, that we don't move i1 values that have been promoted to larger values (such as those passed as function arguments) into bit registers only to quickly turn around and move the values back into GPRs (such as happens when values are returned by functions). A pair of target-specific DAG combines are added to remove the trunc/extends in: trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) and: zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) In short, we only want to use CR bits where some of the i1 values come from comparisons or are used by conditional branches or selects. To put it another way, if we can do the entire i1 computation in GPRs, then we probably should (on the POWER7, the GPR-operation throughput is higher, and for all cores, the CR <-> GPR moves are expensive). POWER7 test-suite performance results (from 10 runs in each configuration): SingleSource/Benchmarks/Misc/mandel-2: 35% speedup MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown MultiSource/Applications/lemon/lemon: 8% slowdown git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 || ArgVT == MVT::i1 ||
!VA.isRegLoc() || VA.needsCustom())
return false;
// Skip bit-converted arguments for now.
if (VA.getLocInfo() == CCValAssign::BCvt)
return false;
}
// Get a count of how many bytes are to be pushed onto the stack.
NumBytes = CCInfo.getNextStackOffset();
2014-06-23 13:47:52 +00:00
// The prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if its varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
[PowerPC] ELFv2 stack space reduction The ELFv2 ABI reduces the amount of stack required to implement an ABI-compliant function call in two ways: * the "linkage area" is reduced from 48 bytes to 32 bytes by eliminating two unused doublewords * the 64-byte "parameter save area" is now optional and need not be present in certain cases (it remains mandatory in functions with variable arguments, and functions that have any parameter that is passed on the stack) The following patch implements this required changes: - reducing the linkage area, and associated relocation of the TOC save slot, in getLinkageSize / getTOCSaveOffset (this requires updating all callers of these routines to pass in the isELFv2ABI flag). - (partially) handling the case where the parameter save are is optional This latter part requires some extra explanation: Currently, we still always allocate the parameter save area when *calling* a function. That is certainly always compliant with the ABI, but may cause code to allocate stack unnecessarily. This can be addressed by a follow-on optimization patch. On the *callee* side, in LowerFormalArguments, we *must* track correctly whether the ABI guarantees that the caller has allocated the parameter save area for our use, and the patch does so. However, there is one complication: the code that handles incoming "byval" arguments will currently *always* write to the parameter save area, because it has to force incoming register arguments to the stack since it must return an *address* to implement the byval semantics. To fix this, the patch changes the LowerFormalArguments code to write arguments to a freshly allocated stack slot on the function's own stack frame instead of the argument save area in those cases where that area is not present. Reviewed by Hal Finkel. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213490 91177308-0d34-0410-b5e6-96231b3b80d8
2014-07-20 23:43:15 +00:00
// FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
NumBytes = std::max(NumBytes, LinkageSize + 64);
2014-06-23 13:47:52 +00:00
// Issue CALLSEQ_START.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameSetupOpcode()))
.addImm(NumBytes);
// Prepare to assign register arguments. Every argument uses up a
// GPR protocol register even if it's passed in a floating-point
// register (unless we're using the fast calling convention).
unsigned NextGPR = PPC::X3;
unsigned NextFPR = PPC::F1;
// Process arguments.
for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
CCValAssign &VA = ArgLocs[I];
unsigned Arg = ArgRegs[VA.getValNo()];
MVT ArgVT = ArgVTs[VA.getValNo()];
// Handle argument promotion and bitcasts.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::SExt: {
MVT DestVT = VA.getLocVT();
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
unsigned TmpReg = createResultReg(RC);
if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
llvm_unreachable("Failed to emit a sext!");
ArgVT = DestVT;
Arg = TmpReg;
break;
}
case CCValAssign::AExt:
case CCValAssign::ZExt: {
MVT DestVT = VA.getLocVT();
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
unsigned TmpReg = createResultReg(RC);
if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
llvm_unreachable("Failed to emit a zext!");
ArgVT = DestVT;
Arg = TmpReg;
break;
}
case CCValAssign::BCvt: {
// FIXME: Not yet handled.
llvm_unreachable("Should have bailed before getting here!");
break;
}
}
// Copy this argument to the appropriate register.
unsigned ArgReg;
if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
ArgReg = NextFPR++;
if (CC != CallingConv::Fast)
++NextGPR;
} else
ArgReg = NextGPR++;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ArgReg).addReg(Arg);
RegArgs.push_back(ArgReg);
}
return true;
}
// For a call that we've determined we can fast-select, finish the
// call sequence and generate a copy to obtain the return value (if any).
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes) {
CallingConv::ID CC = CLI.CallConv;
// Issue CallSEQ_END.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameDestroyOpcode()))
.addImm(NumBytes).addImm(0);
// Next, generate a copy to obtain the return value.
// FIXME: No multi-register return values yet, though I don't foresee
// any real difficulties there.
if (RetVT != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
CCValAssign &VA = RVLocs[0];
assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
assert(VA.isRegLoc() && "Can only return in registers!");
MVT DestVT = VA.getValVT();
MVT CopyVT = DestVT;
// Ints smaller than a register still arrive in a full 64-bit
// register, so make sure we recognize this.
if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32)
CopyVT = MVT::i64;
unsigned SourcePhysReg = VA.getLocReg();
unsigned ResultReg = 0;
if (RetVT == CopyVT) {
const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
ResultReg = createResultReg(CpyRC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(SourcePhysReg);
// If necessary, round the floating result to single precision.
} else if (CopyVT == MVT::f64) {
ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP),
ResultReg).addReg(SourcePhysReg);
// If only the low half of a general register is needed, generate
// a GPRC copy instead of a G8RC copy. (EXTRACT_SUBREG can't be
// used along the fast-isel path (not lowered), and downstream logic
// also doesn't like a direct subreg copy on a physical reg.)
} else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
ResultReg = createResultReg(&PPC::GPRCRegClass);
// Convert physical register from G8RC to GPRC.
SourcePhysReg -= PPC::X0 - PPC::R0;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(SourcePhysReg);
}
assert(ResultReg && "ResultReg unset!");
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
CLI.InRegs.push_back(SourcePhysReg);
CLI.ResultReg = ResultReg;
CLI.NumResultRegs = 1;
}
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
return true;
}
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
CallingConv::ID CC = CLI.CallConv;
bool IsTailCall = CLI.IsTailCall;
bool IsVarArg = CLI.IsVarArg;
const Value *Callee = CLI.Callee;
const MCSymbol *Symbol = CLI.Symbol;
if (!Callee && !Symbol)
return false;
// Allow SelectionDAG isel to handle tail calls.
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
if (IsTailCall)
return false;
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
// Let SDISel handle vararg functions.
if (IsVarArg)
return false;
// Handle simple calls for now, with legal return types and
// those that can be extended.
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
Type *RetTy = CLI.RetTy;
MVT RetVT;
if (RetTy->isVoidTy())
RetVT = MVT::isVoid;
else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
RetVT != MVT::i8)
return false;
else if (RetVT == MVT::i1 && PPCSubTarget->useCRBits())
// We can't handle boolean returns when CR bits are in use.
return false;
// FIXME: No multi-register return values yet.
if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
RetVT != MVT::f64) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context);
CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
if (RVLocs.size() > 1)
return false;
}
// Bail early if more than 8 arguments, as we only currently
// handle arguments passed in registers.
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
unsigned NumArgs = CLI.OutVals.size();
if (NumArgs > 8)
return false;
// Set up the argument vectors.
SmallVector<Value*, 8> Args;
SmallVector<unsigned, 8> ArgRegs;
SmallVector<MVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
Args.reserve(NumArgs);
ArgRegs.reserve(NumArgs);
ArgVTs.reserve(NumArgs);
ArgFlags.reserve(NumArgs);
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
for (unsigned i = 0, ie = NumArgs; i != ie; ++i) {
// Only handle easy calls for now. It would be reasonably easy
// to handle <= 8-byte structures passed ByVal in registers, but we
// have to ensure they are right-justified in the register.
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
ISD::ArgFlagsTy Flags = CLI.OutFlags[i];
if (Flags.isInReg() || Flags.isSRet() || Flags.isNest() || Flags.isByVal())
return false;
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
Value *ArgValue = CLI.OutVals[i];
Type *ArgTy = ArgValue->getType();
MVT ArgVT;
if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
return false;
if (ArgVT.isVector())
return false;
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
unsigned Arg = getRegForValue(ArgValue);
if (Arg == 0)
return false;
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
Args.push_back(ArgValue);
ArgRegs.push_back(Arg);
ArgVTs.push_back(ArgVT);
ArgFlags.push_back(Flags);
}
// Process the arguments.
SmallVector<unsigned, 8> RegArgs;
unsigned NumBytes;
if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
RegArgs, CC, NumBytes, IsVarArg))
return false;
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
MachineInstrBuilder MIB;
// FIXME: No handling for function pointers yet. This requires
// implementing the function descriptor (OPD) setup.
const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
if (!GV) {
// patchpoints are a special case; they always dispatch to a pointer value.
// However, we don't actually want to generate the indirect call sequence
// here (that will be generated, as necessary, during asm printing), and
// the call we generate here will be erased by FastISel::selectPatchpoint,
// so don't try very hard...
if (CLI.IsPatchPoint)
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::NOP));
else
return false;
} else {
// Build direct call with NOP for TOC restore.
// FIXME: We can and should optimize away the NOP for local calls.
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::BL8_NOP));
// Add callee.
MIB.addGlobalAddress(GV);
}
// Add implicit physical register uses to the call.
for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
MIB.addReg(RegArgs[II], RegState::Implicit);
[PowerPC] ELFv2 function call changes This patch builds upon the two preceding MC changes to implement the basic ELFv2 function call convention. In the ELFv1 ABI, a "function descriptor" was associated with every function, pointing to both the entry address and the related TOC base (and a static chain pointer for nested functions). Function pointers would actually refer to that descriptor, and the indirect call sequence needed to load up both entry address and TOC base. In the ELFv2 ABI, there are no more function descriptors, and function pointers simply refer to the (global) entry point of the function code. Indirect function calls simply branch to that address, after loading it up into r12 (as required by the ABI rules for a global entry point). Direct function calls continue to just do a "bl" to the target symbol; this will be resolved by the linker to the local entry point of the target function if it is local, and to a PLT stub if it is global. That PLT stub would then load the (global) entry point address of the final target into r12 and branch to it. Note that when performing a local function call, r2 must be set up to point to the current TOC base: if the target ends up local, the ABI requires that its local entry point is called with r2 set up; if the target ends up global, the PLT stub requires that r2 is set up. This patch implements all LLVM changes to implement that scheme: - No longer create a function descriptor when emitting a function definition (in EmitFunctionEntryLabel) - Emit two entry points *if* the function needs the TOC base (r2) anywhere (this is done EmitFunctionBodyStart; note that this cannot be done in EmitFunctionBodyStart because the global entry point prologue code must be *part* of the function as covered by debug info). - In order to make use tracking of r2 (as needed above) work correctly, mark direct function calls as implicitly using r2. - Implement the ELFv2 indirect function call sequence (no function descriptors; load target address into r12). - When creating an ELFv2 object file, emit the .abiversion 2 directive to tell the linker to create the appropriate version of PLT stubs. Reviewed by Hal Finkel. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213489 91177308-0d34-0410-b5e6-96231b3b80d8
2014-07-20 23:31:44 +00:00
// Direct calls, in both the ELF V1 and V2 ABIs, need the TOC register live
// into the call.
PPCFuncInfo->setUsesTOCBasePtr();
MIB.addReg(PPC::X2, RegState::Implicit);
// Add a register mask with the call-preserved registers. Proper
// defs for return values will be added by setPhysRegsDeadExcept().
MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
CLI.Call = MIB;
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
// Finish off the call including any return values.
return finishCall(RetVT, CLI, NumBytes);
}
// Attempt to fast-select a return instruction.
bool PPCFastISel::SelectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
// Build a list of return value registers.
SmallVector<unsigned, 4> RetRegs;
CallingConv::ID CC = F.getCallingConv();
if (Ret->getNumOperands() > 0) {
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, *Context);
CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
const Value *RV = Ret->getOperand(0);
// FIXME: Only one output register for now.
if (ValLocs.size() > 1)
return false;
// Special case for returning a constant integer of any size.
// Materialize the constant as an i64 and copy it to the return
// register. We still need to worry about properly extending the sign. E.g:
// If the constant has only one bit, it means it is a boolean. Therefore
// we can't use PPCMaterializeInt because it extends the sign which will
// cause negations of the returned value to be incorrect as they are
// implemented as the flip of the least significant bit.
if (isa<ConstantInt>(*RV)) {
const Constant *C = cast<Constant>(RV);
CCValAssign &VA = ValLocs[0];
unsigned RetReg = VA.getLocReg();
unsigned SrcReg = PPCMaterializeInt(C, MVT::i64,
VA.getLocInfo() == CCValAssign::SExt);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
RetRegs.push_back(RetReg);
} else {
unsigned Reg = getRegForValue(RV);
if (Reg == 0)
return false;
// Copy the result values into the output registers.
for (unsigned i = 0; i < ValLocs.size(); ++i) {
CCValAssign &VA = ValLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
RetRegs.push_back(VA.getLocReg());
unsigned SrcReg = Reg + VA.getValNo();
EVT RVEVT = TLI.getValueType(DL, RV->getType());
if (!RVEVT.isSimple())
return false;
MVT RVVT = RVEVT.getSimpleVT();
MVT DestVT = VA.getLocVT();
if (RVVT != DestVT && RVVT != MVT::i8 &&
RVVT != MVT::i16 && RVVT != MVT::i32)
return false;
if (RVVT != DestVT) {
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
llvm_unreachable("Full value assign but types don't match?");
case CCValAssign::AExt:
case CCValAssign::ZExt: {
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
unsigned TmpReg = createResultReg(RC);
if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true))
return false;
SrcReg = TmpReg;
break;
}
case CCValAssign::SExt: {
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
unsigned TmpReg = createResultReg(RC);
if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false))
return false;
SrcReg = TmpReg;
break;
}
}
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), RetRegs[i])
.addReg(SrcReg);
}
}
}
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::BLR8));
for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
MIB.addReg(RetRegs[i], RegState::Implicit);
return true;
}
// Attempt to emit an integer extend of SrcReg into DestReg. Both
// signed and zero extensions are supported. Return false if we
// can't handle it.
bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
unsigned DestReg, bool IsZExt) {
if (DestVT != MVT::i32 && DestVT != MVT::i64)
return false;
if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && SrcVT != MVT::i32)
return false;
// Signed extensions use EXTSB, EXTSH, EXTSW.
if (!IsZExt) {
unsigned Opc;
if (SrcVT == MVT::i8)
Opc = (DestVT == MVT::i32) ? PPC::EXTSB : PPC::EXTSB8_32_64;
else if (SrcVT == MVT::i16)
Opc = (DestVT == MVT::i32) ? PPC::EXTSH : PPC::EXTSH8_32_64;
else {
assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??");
Opc = PPC::EXTSW_32_64;
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(SrcReg);
// Unsigned 32-bit extensions use RLWINM.
} else if (DestVT == MVT::i32) {
unsigned MB;
if (SrcVT == MVT::i8)
MB = 24;
else {
assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??");
MB = 16;
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLWINM),
DestReg)
.addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31);
// Unsigned 64-bit extensions use RLDICL (with a 32-bit source).
} else {
unsigned MB;
if (SrcVT == MVT::i8)
MB = 56;
else if (SrcVT == MVT::i16)
MB = 48;
else
MB = 32;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::RLDICL_32_64), DestReg)
.addReg(SrcReg).addImm(/*SH=*/0).addImm(MB);
}
return true;
}
// Attempt to fast-select an indirect branch instruction.
bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
unsigned AddrReg = getRegForValue(I->getOperand(0));
if (AddrReg == 0)
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::MTCTR8))
.addReg(AddrReg);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8));
const IndirectBrInst *IB = cast<IndirectBrInst>(I);
for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
return true;
}
// Attempt to fast-select an integer truncate instruction.
bool PPCFastISel::SelectTrunc(const Instruction *I) {
Value *Src = I->getOperand(0);
EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
EVT DestVT = TLI.getValueType(DL, I->getType(), true);
if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16)
return false;
if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
return false;
unsigned SrcReg = getRegForValue(Src);
if (!SrcReg)
return false;
// The only interesting case is when we need to switch register classes.
if (SrcVT == MVT::i64) {
unsigned ResultReg = createResultReg(&PPC::GPRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY),
ResultReg).addReg(SrcReg, 0, PPC::sub_32);
SrcReg = ResultReg;
}
updateValueMap(I, SrcReg);
return true;
}
// Attempt to fast-select an integer extend instruction.
bool PPCFastISel::SelectIntExt(const Instruction *I) {
Type *DestTy = I->getType();
Value *Src = I->getOperand(0);
Type *SrcTy = Src->getType();
bool IsZExt = isa<ZExtInst>(I);
unsigned SrcReg = getRegForValue(Src);
if (!SrcReg) return false;
EVT SrcEVT, DestEVT;
SrcEVT = TLI.getValueType(DL, SrcTy, true);
DestEVT = TLI.getValueType(DL, DestTy, true);
if (!SrcEVT.isSimple())
return false;
if (!DestEVT.isSimple())
return false;
MVT SrcVT = SrcEVT.getSimpleVT();
MVT DestVT = DestEVT.getSimpleVT();
// If we know the register class needed for the result of this
// instruction, use it. Otherwise pick the register class of the
// correct size that does not contain X0/R0, since we don't know
// whether downstream uses permit that assignment.
unsigned AssignedReg = FuncInfo.ValueMap[I];
const TargetRegisterClass *RC =
(AssignedReg ? MRI.getRegClass(AssignedReg) :
(DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
&PPC::GPRC_and_GPRC_NOR0RegClass));
unsigned ResultReg = createResultReg(RC);
if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt))
return false;
updateValueMap(I, ResultReg);
return true;
}
// Attempt to fast-select an instruction that wasn't handled by
// the table-generated machinery.
bool PPCFastISel::fastSelectInstruction(const Instruction *I) {
switch (I->getOpcode()) {
case Instruction::Load:
return SelectLoad(I);
case Instruction::Store:
return SelectStore(I);
case Instruction::Br:
return SelectBranch(I);
case Instruction::IndirectBr:
return SelectIndirectBr(I);
case Instruction::FPExt:
return SelectFPExt(I);
case Instruction::FPTrunc:
return SelectFPTrunc(I);
case Instruction::SIToFP:
return SelectIToFP(I, /*IsSigned*/ true);
case Instruction::UIToFP:
return SelectIToFP(I, /*IsSigned*/ false);
case Instruction::FPToSI:
return SelectFPToI(I, /*IsSigned*/ true);
case Instruction::FPToUI:
return SelectFPToI(I, /*IsSigned*/ false);
case Instruction::Add:
return SelectBinaryIntOp(I, ISD::ADD);
case Instruction::Or:
return SelectBinaryIntOp(I, ISD::OR);
case Instruction::Sub:
return SelectBinaryIntOp(I, ISD::SUB);
case Instruction::Call:
Revert "r225811 - Revert "r225808 - [PowerPC] Add StackMap/PatchPoint support"" This re-applies r225808, fixed to avoid problems with SDAG dependencies along with the preceding fix to ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs. These problems caused the original regression tests to assert/segfault on many (but not all) systems. Original commit message: This commit does two things: 1. Refactors PPCFastISel to use more of the common infrastructure for call lowering (this lets us take advantage of this common code for lowering some common intrinsics, stackmap/patchpoint among them). 2. Adds support for stackmap/patchpoint lowering. For the most part, this is very similar to the support in the AArch64 target, with the obvious differences (different registers, NOP instructions, etc.). The test cases are adapted from the AArch64 test cases. One difference of note is that the patchpoint call sequence takes 24 bytes, so you can't use less than that (on AArch64 you can go down to 16). Also, as noted in the docs, we take the patchpoint address to be the actual code address (assuming the call is local in the TOC-sharing sense), which should yield higher performance than generating the full cross-DSO indirect-call sequence and is likely just as useful for JITed code (if not, we'll change it). StackMaps and Patchpoints are still marked as experimental, and so this support is doubly experimental. So go ahead and experiment! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225909 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-14 01:07:51 +00:00
return selectCall(I);
case Instruction::Ret:
return SelectRet(I);
case Instruction::Trunc:
return SelectTrunc(I);
case Instruction::ZExt:
case Instruction::SExt:
return SelectIntExt(I);
// Here add other flavors of Instruction::XXX that automated
// cases don't catch. For example, switches are terminators
// that aren't yet handled.
default:
break;
}
return false;
}
// Materialize a floating-point constant into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
// No plans to handle long double here.
if (VT != MVT::f32 && VT != MVT::f64)
return 0;
// All FP constants are loaded from the constant pool.
unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
assert(Align > 0 && "Unexpectedly missing alignment information!");
unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
CodeModel::Model CModel = TM.getCodeModel();
MachineMemOperand *MMO =
FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
(VT == MVT::f32) ? 4 : 8, Align);
unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
PPCFuncInfo->setUsesTOCBasePtr();
// For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT),
TmpReg)
.addConstantPoolIndex(Idx).addReg(PPC::X2);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addImm(0).addReg(TmpReg).addMemOperand(MMO);
} else {
// Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx);
// But for large code model, we must generate a LDtocL followed
// by the LF[SD].
if (CModel == CodeModel::Large) {
unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addImm(0).addReg(TmpReg2);
} else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
.addReg(TmpReg)
.addMemOperand(MMO);
}
return DestReg;
}
// Materialize the address of a global value into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
assert(VT == MVT::i64 && "Non-address!");
const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
unsigned DestReg = createResultReg(RC);
// Global values may be plain old object addresses, TLS object
// addresses, constant pool entries, or jump tables. How we generate
// code for these may depend on small, medium, or large code model.
CodeModel::Model CModel = TM.getCodeModel();
// FIXME: Jump tables are not yet required because fast-isel doesn't
// handle switches; if that changes, we need them as well. For now,
// what follows assumes everything's a generic (or TLS) global address.
// FIXME: We don't yet handle the complexity of TLS.
if (GV->isThreadLocal())
return 0;
PPCFuncInfo->setUsesTOCBasePtr();
// For small code model, generate a simple TOC load.
if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc),
DestReg)
.addGlobalAddress(GV)
.addReg(PPC::X2);
else {
// If the address is an externally defined symbol, a symbol with common
// or externally available linkage, a non-local function address, or a
// jump table address (not yet needed), or if we are generating code
// for large code model, we generate:
// LDtocL(GV, ADDIStocHA(%X2, GV))
// Otherwise we generate:
// ADDItocL(ADDIStocHA(%X2, GV), GV)
// Either way, start with the ADDIStocHA:
unsigned HighPartReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
// If/when switches are implemented, jump tables should be handled
// on the "if" path here.
if (CModel == CodeModel::Large ||
(GV->getType()->getElementType()->isFunctionTy() &&
!GV->isStrongDefinitionForLinker()) ||
GV->isDeclaration() || GV->hasCommonLinkage() ||
GV->hasAvailableExternallyLinkage())
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
else
// Otherwise generate the ADDItocL.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL),
DestReg).addReg(HighPartReg).addGlobalAddress(GV);
}
return DestReg;
}
// Materialize a 32-bit integer constant into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
const TargetRegisterClass *RC) {
unsigned Lo = Imm & 0xFFFF;
unsigned Hi = (Imm >> 16) & 0xFFFF;
unsigned ResultReg = createResultReg(RC);
bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
if (isInt<16>(Imm))
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::LI : PPC::LI8), ResultReg)
.addImm(Imm);
else if (Lo) {
// Both Lo and Hi have nonzero bits.
unsigned TmpReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg)
.addImm(Hi);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::ORI : PPC::ORI8), ResultReg)
.addReg(TmpReg).addImm(Lo);
} else
// Just Hi bits.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg)
.addImm(Hi);
return ResultReg;
}
// Materialize a 64-bit integer constant into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
const TargetRegisterClass *RC) {
unsigned Remainder = 0;
unsigned Shift = 0;
// If the value doesn't fit in 32 bits, see if we can shift it
// so that it fits in 32 bits.
if (!isInt<32>(Imm)) {
Shift = countTrailingZeros<uint64_t>(Imm);
int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
if (isInt<32>(ImmSh))
Imm = ImmSh;
else {
Remainder = Imm;
Shift = 32;
Imm >>= 32;
}
}
// Handle the high-order 32 bits (if shifted) or the whole 32 bits
// (if not shifted).
unsigned TmpReg1 = PPCMaterialize32BitInt(Imm, RC);
if (!Shift)
return TmpReg1;
// If upper 32 bits were not zero, we've built them and need to shift
// them into place.
unsigned TmpReg2;
if (Imm) {
TmpReg2 = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLDICR),
TmpReg2).addReg(TmpReg1).addImm(Shift).addImm(63 - Shift);
} else
TmpReg2 = TmpReg1;
unsigned TmpReg3, Hi, Lo;
if ((Hi = (Remainder >> 16) & 0xFFFF)) {
TmpReg3 = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORIS8),
TmpReg3).addReg(TmpReg2).addImm(Hi);
} else
TmpReg3 = TmpReg2;
if ((Lo = Remainder & 0xFFFF)) {
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8),
ResultReg).addReg(TmpReg3).addImm(Lo);
return ResultReg;
}
return TmpReg3;
}
// Materialize an integer constant into a register, and return
// the register number (or zero if we failed to handle it).
unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT,
bool UseSExt) {
Add CR-bit tracking to the PowerPC backend for i1 values This change enables tracking i1 values in the PowerPC backend using the condition register bits. These bits can be treated on PowerPC as separate registers; individual bit operations (and, or, xor, etc.) are supported. Tracking booleans in CR bits has several advantages: - Reduction in register pressure (because we no longer need GPRs to store boolean values). - Logical operations on booleans can be handled more efficiently; we used to have to move all results from comparisons into GPRs, perform promoted logical operations in GPRs, and then move the result back into condition register bits to be used by conditional branches. This can be very inefficient, because the throughput of these CR <-> GPR moves have high latency and low throughput (especially when other associated instructions are accounted for). - On the POWER7 and similar cores, we can increase total throughput by using the CR bits. CR bit operations have a dedicated functional unit. Most of this is more-or-less mechanical: Adjustments were needed in the calling-convention code, support was added for spilling/restoring individual condition-register bits, and conditional branch instruction definitions taking specific CR bits were added (plus patterns and code for generating bit-level operations). This is enabled by default when running at -O2 and higher. For -O0 and -O1, where the ability to debug is more important, this feature is disabled by default. Individual CR bits do not have assigned DWARF register numbers, and storing values in CR bits makes them invisible to the debugger. It is critical, however, that we don't move i1 values that have been promoted to larger values (such as those passed as function arguments) into bit registers only to quickly turn around and move the values back into GPRs (such as happens when values are returned by functions). A pair of target-specific DAG combines are added to remove the trunc/extends in: trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) and: zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) In short, we only want to use CR bits where some of the i1 values come from comparisons or are used by conditional branches or selects. To put it another way, if we can do the entire i1 computation in GPRs, then we probably should (on the POWER7, the GPR-operation throughput is higher, and for all cores, the CR <-> GPR moves are expensive). POWER7 test-suite performance results (from 10 runs in each configuration): SingleSource/Benchmarks/Misc/mandel-2: 35% speedup MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown MultiSource/Applications/lemon/lemon: 8% slowdown git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
// If we're using CR bit registers for i1 values, handle that as a special
// case first.
if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
Add CR-bit tracking to the PowerPC backend for i1 values This change enables tracking i1 values in the PowerPC backend using the condition register bits. These bits can be treated on PowerPC as separate registers; individual bit operations (and, or, xor, etc.) are supported. Tracking booleans in CR bits has several advantages: - Reduction in register pressure (because we no longer need GPRs to store boolean values). - Logical operations on booleans can be handled more efficiently; we used to have to move all results from comparisons into GPRs, perform promoted logical operations in GPRs, and then move the result back into condition register bits to be used by conditional branches. This can be very inefficient, because the throughput of these CR <-> GPR moves have high latency and low throughput (especially when other associated instructions are accounted for). - On the POWER7 and similar cores, we can increase total throughput by using the CR bits. CR bit operations have a dedicated functional unit. Most of this is more-or-less mechanical: Adjustments were needed in the calling-convention code, support was added for spilling/restoring individual condition-register bits, and conditional branch instruction definitions taking specific CR bits were added (plus patterns and code for generating bit-level operations). This is enabled by default when running at -O2 and higher. For -O0 and -O1, where the ability to debug is more important, this feature is disabled by default. Individual CR bits do not have assigned DWARF register numbers, and storing values in CR bits makes them invisible to the debugger. It is critical, however, that we don't move i1 values that have been promoted to larger values (such as those passed as function arguments) into bit registers only to quickly turn around and move the values back into GPRs (such as happens when values are returned by functions). A pair of target-specific DAG combines are added to remove the trunc/extends in: trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) and: zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) In short, we only want to use CR bits where some of the i1 values come from comparisons or are used by conditional branches or selects. To put it another way, if we can do the entire i1 computation in GPRs, then we probably should (on the POWER7, the GPR-operation throughput is higher, and for all cores, the CR <-> GPR moves are expensive). POWER7 test-suite performance results (from 10 runs in each configuration): SingleSource/Benchmarks/Misc/mandel-2: 35% speedup MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown MultiSource/Applications/lemon/lemon: 8% slowdown git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
const ConstantInt *CI = cast<ConstantInt>(C);
unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg);
return ImmReg;
}
if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
VT != MVT::i8 && VT != MVT::i1)
return 0;
const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
&PPC::GPRCRegClass);
// If the constant is in range, use a load-immediate.
const ConstantInt *CI = cast<ConstantInt>(C);
if (isInt<16>(CI->getSExtValue())) {
unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
unsigned ImmReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
.addImm( (UseSExt) ? CI->getSExtValue() : CI->getZExtValue() );
return ImmReg;
}
// Construct the constant piecewise.
int64_t Imm = CI->getZExtValue();
if (VT == MVT::i64)
return PPCMaterialize64BitInt(Imm, RC);
else if (VT == MVT::i32)
return PPCMaterialize32BitInt(Imm, RC);
return 0;
}
// Materialize a constant into a register, and return the register
// number (or zero if we failed to handle it).
unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) {
EVT CEVT = TLI.getValueType(DL, C->getType(), true);
// Only handle simple types.
if (!CEVT.isSimple()) return 0;
MVT VT = CEVT.getSimpleVT();
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
return PPCMaterializeFP(CFP, VT);
else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return PPCMaterializeGV(GV, VT);
else if (isa<ConstantInt>(C))
return PPCMaterializeInt(C, VT, VT != MVT::i1);
return 0;
}
// Materialize the address created by an alloca into a register, and
// return the register number (or zero if we failed to handle it).
unsigned PPCFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
// Don't handle dynamic allocas.
if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
MVT VT;
if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
DenseMap<const AllocaInst*, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
ResultReg).addFrameIndex(SI->second).addImm(0);
return ResultReg;
}
return 0;
}
// Fold loads into extends when possible.
// FIXME: We can have multiple redundant extend/trunc instructions
// following a load. The folding only picks up one. Extend this
// to check subsequent instructions for the same pattern and remove
// them. Thus ResultReg should be the def reg for the last redundant
// instruction in a chain, and all intervening instructions can be
// removed from parent. Change test/CodeGen/PowerPC/fast-isel-fold.ll
// to add ELF64-NOT: rldicl to the appropriate tests when this works.
bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) {
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(LI->getType(), VT))
return false;
// Combine load followed by zero- or sign-extend.
bool IsZExt = false;
switch(MI->getOpcode()) {
default:
return false;
case PPC::RLDICL:
case PPC::RLDICL_32_64: {
IsZExt = true;
unsigned MB = MI->getOperand(3).getImm();
if ((VT == MVT::i8 && MB <= 56) ||
(VT == MVT::i16 && MB <= 48) ||
(VT == MVT::i32 && MB <= 32))
break;
return false;
}
case PPC::RLWINM:
case PPC::RLWINM8: {
IsZExt = true;
unsigned MB = MI->getOperand(3).getImm();
if ((VT == MVT::i8 && MB <= 24) ||
(VT == MVT::i16 && MB <= 16))
break;
return false;
}
case PPC::EXTSB:
case PPC::EXTSB8:
case PPC::EXTSB8_32_64:
/* There is no sign-extending load-byte instruction. */
return false;
case PPC::EXTSH:
case PPC::EXTSH8:
case PPC::EXTSH8_32_64: {
if (VT != MVT::i16 && VT != MVT::i8)
return false;
break;
}
case PPC::EXTSW:
case PPC::EXTSW_32_64: {
if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
return false;
break;
}
}
// See if we can handle this address.
Address Addr;
if (!PPCComputeAddress(LI->getOperand(0), Addr))
return false;
unsigned ResultReg = MI->getOperand(0).getReg();
if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt))
return false;
MI->eraseFromParent();
return true;
}
// Attempt to lower call arguments in a faster way than done by
// the selection DAG code.
bool PPCFastISel::fastLowerArguments() {
// Defer to normal argument lowering for now. It's reasonably
// efficient. Consider doing something like ARM to handle the
// case where all args fit in registers, no varargs, no float
// or vector args.
return false;
}
// Handle materializing integer constants into a register. This is not
// automatically generated for PowerPC, so must be explicitly created here.
unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
if (Opc != ISD::Constant)
return 0;
Add CR-bit tracking to the PowerPC backend for i1 values This change enables tracking i1 values in the PowerPC backend using the condition register bits. These bits can be treated on PowerPC as separate registers; individual bit operations (and, or, xor, etc.) are supported. Tracking booleans in CR bits has several advantages: - Reduction in register pressure (because we no longer need GPRs to store boolean values). - Logical operations on booleans can be handled more efficiently; we used to have to move all results from comparisons into GPRs, perform promoted logical operations in GPRs, and then move the result back into condition register bits to be used by conditional branches. This can be very inefficient, because the throughput of these CR <-> GPR moves have high latency and low throughput (especially when other associated instructions are accounted for). - On the POWER7 and similar cores, we can increase total throughput by using the CR bits. CR bit operations have a dedicated functional unit. Most of this is more-or-less mechanical: Adjustments were needed in the calling-convention code, support was added for spilling/restoring individual condition-register bits, and conditional branch instruction definitions taking specific CR bits were added (plus patterns and code for generating bit-level operations). This is enabled by default when running at -O2 and higher. For -O0 and -O1, where the ability to debug is more important, this feature is disabled by default. Individual CR bits do not have assigned DWARF register numbers, and storing values in CR bits makes them invisible to the debugger. It is critical, however, that we don't move i1 values that have been promoted to larger values (such as those passed as function arguments) into bit registers only to quickly turn around and move the values back into GPRs (such as happens when values are returned by functions). A pair of target-specific DAG combines are added to remove the trunc/extends in: trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) and: zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) In short, we only want to use CR bits where some of the i1 values come from comparisons or are used by conditional branches or selects. To put it another way, if we can do the entire i1 computation in GPRs, then we probably should (on the POWER7, the GPR-operation throughput is higher, and for all cores, the CR <-> GPR moves are expensive). POWER7 test-suite performance results (from 10 runs in each configuration): SingleSource/Benchmarks/Misc/mandel-2: 35% speedup MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown MultiSource/Applications/lemon/lemon: 8% slowdown git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
// If we're using CR bit registers for i1 values, handle that as a special
// case first.
if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
Add CR-bit tracking to the PowerPC backend for i1 values This change enables tracking i1 values in the PowerPC backend using the condition register bits. These bits can be treated on PowerPC as separate registers; individual bit operations (and, or, xor, etc.) are supported. Tracking booleans in CR bits has several advantages: - Reduction in register pressure (because we no longer need GPRs to store boolean values). - Logical operations on booleans can be handled more efficiently; we used to have to move all results from comparisons into GPRs, perform promoted logical operations in GPRs, and then move the result back into condition register bits to be used by conditional branches. This can be very inefficient, because the throughput of these CR <-> GPR moves have high latency and low throughput (especially when other associated instructions are accounted for). - On the POWER7 and similar cores, we can increase total throughput by using the CR bits. CR bit operations have a dedicated functional unit. Most of this is more-or-less mechanical: Adjustments were needed in the calling-convention code, support was added for spilling/restoring individual condition-register bits, and conditional branch instruction definitions taking specific CR bits were added (plus patterns and code for generating bit-level operations). This is enabled by default when running at -O2 and higher. For -O0 and -O1, where the ability to debug is more important, this feature is disabled by default. Individual CR bits do not have assigned DWARF register numbers, and storing values in CR bits makes them invisible to the debugger. It is critical, however, that we don't move i1 values that have been promoted to larger values (such as those passed as function arguments) into bit registers only to quickly turn around and move the values back into GPRs (such as happens when values are returned by functions). A pair of target-specific DAG combines are added to remove the trunc/extends in: trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) and: zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) In short, we only want to use CR bits where some of the i1 values come from comparisons or are used by conditional branches or selects. To put it another way, if we can do the entire i1 computation in GPRs, then we probably should (on the POWER7, the GPR-operation throughput is higher, and for all cores, the CR <-> GPR moves are expensive). POWER7 test-suite performance results (from 10 runs in each configuration): SingleSource/Benchmarks/Misc/mandel-2: 35% speedup MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown MultiSource/Applications/lemon/lemon: 8% slowdown git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg);
return ImmReg;
}
if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
VT != MVT::i8 && VT != MVT::i1)
return 0;
const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
&PPC::GPRCRegClass);
if (VT == MVT::i64)
return PPCMaterialize64BitInt(Imm, RC);
else
return PPCMaterialize32BitInt(Imm, RC);
}
// Override for ADDI and ADDI8 to set the correct register class
// on RHS operand 0. The automatic infrastructure naively assumes
// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost
// for these cases. At the moment, none of the other automatically
// generated RI instructions require special treatment. However, once
// SelectSelect is implemented, "isel" requires similar handling.
//
// Also be conservative about the output register class. Avoid
// assigning R0 or X0 to the output register for GPRC and G8RC
// register classes, as any such result could be used in ADDI, etc.,
// where those regs have another meaning.
unsigned PPCFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill,
uint64_t Imm) {
if (MachineInstOpcode == PPC::ADDI)
MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass);
else if (MachineInstOpcode == PPC::ADDI8)
MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass);
const TargetRegisterClass *UseRC =
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
return FastISel::fastEmitInst_ri(MachineInstOpcode, UseRC,
Op0, Op0IsKill, Imm);
}
// Override for instructions with one register operand to avoid use of
// R0/X0. The automatic infrastructure isn't aware of the context so
// we must be conservative.
unsigned PPCFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass* RC,
unsigned Op0, bool Op0IsKill) {
const TargetRegisterClass *UseRC =
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
return FastISel::fastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
}
// Override for instructions with two register operands to avoid use
// of R0/X0. The automatic infrastructure isn't aware of the context
// so we must be conservative.
unsigned PPCFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass* RC,
unsigned Op0, bool Op0IsKill,
unsigned Op1, bool Op1IsKill) {
const TargetRegisterClass *UseRC =
(RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
(RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
return FastISel::fastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
Op1, Op1IsKill);
}
namespace llvm {
// Create the fast instruction selector for PowerPC64 ELF.
FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) {
// Only available on 64-bit ELF for now.
const PPCSubtarget &Subtarget = FuncInfo.MF->getSubtarget<PPCSubtarget>();
if (Subtarget.isPPC64() && Subtarget.isSVR4ABI())
return new PPCFastISel(FuncInfo, LibInfo);
return nullptr;
}
}