llvm-6502/lib/Target/R600/AMDILPeepholeOptimizer.cpp
Chandler Carruth 0b8c9a80f2 Move all of the header files which are involved in modelling the LLVM IR
into their new header subdirectory: include/llvm/IR. This matches the
directory structure of lib, and begins to correct a long standing point
of file layout clutter in LLVM.

There are still more header files to move here, but I wanted to handle
them in separate commits to make tracking what files make sense at each
layer easier.

The only really questionable files here are the target intrinsic
tablegen files. But that's a battle I'd rather not fight today.

I've updated both CMake and Makefile build systems (I think, and my
tests think, but I may have missed something).

I've also re-sorted the includes throughout the project. I'll be
committing updates to Clang, DragonEgg, and Polly momentarily.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171366 91177308-0d34-0410-b5e6-96231b3b80d8
2013-01-02 11:36:10 +00:00

1216 lines
39 KiB
C++

//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//==-----------------------------------------------------------------------===//
#define DEBUG_TYPE "PeepholeOpt"
#ifdef DEBUG
#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
#else
#define DEBUGME 0
#endif
#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include <sstream>
#if 0
STATISTIC(PointerAssignments, "Number of dynamic pointer "
"assigments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
#endif
using namespace llvm;
// The Peephole optimization pass is used to do simple last minute optimizations
// that are required for correct code or to remove redundant functions
namespace {
// Forward declaration only; used as a parameter type by one getTypeSize
// overload below.
class OpaqueType;
// IR-level function pass that performs the AMDGPU/AMDIL peephole rewrites
// declared below (call expansion, bit extract/insert matching, alignment
// correction). runOnFunction drives instLevelOptimizations over every
// instruction and then applies the deferred atomic and is-constant rewrites.
class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
// Target machine supplied at construction; used to query the optimization
// level and the AMDGPU subtarget.
TargetMachine &TM;
// Pass identification (its address is handed to the FunctionPass base).
static char ID;
AMDGPUPeepholeOpt(TargetMachine &tm);
~AMDGPUPeepholeOpt();
const char *getPassName() const;
bool runOnFunction(Function &F);
bool doInitialization(Module &M);
bool doFinalization(Module &M);
void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
// Function to initiate all of the instruction level optimizations.
bool instLevelOptimizations(BasicBlock::iterator *inst);
// Quick check to see if we need to dump all of the pointers into the
// arena. If this is correct, then we set all pointers to exist in arena. This
// is a workaround for aliasing of pointers in a struct/union.
bool dumpAllIntoArena(Function &F);
// Because I don't want to invalidate any pointers while in the
// safeNestedForEachFunction. I push atomic conversions to a vector and handle
// it later. This function does the conversions if required.
void doAtomicConversionIfNeeded(Function &F);
// Because __amdil_is_constant cannot be properly evaluated if
// optimizations are disabled, the calls are placed in a vector
// and evaluated after the __amdil_image* functions are evaluated
// which should allow the __amdil_is_constant function to be
// evaluated correctly.
void doIsConstCallConversionIfNeeded();
// Value returned by runOnFunction; reset to false at the start of each run.
bool mChanged;
// Set from the DEBUGME macro in the constructor; gates the dbgs()/dump()
// output throughout the pass.
bool mDebug;
// Reset to true at the start of each runOnFunction; cleared by
// optimizeCallInst when it sees a "__atom" callee whose name contains "_g".
bool mConvertAtomics;
// Optimization level captured from the target machine in the constructor.
CodeGenOpt::Level optLevel;
// Run a series of tests to see if we can optimize a CALL instruction.
bool optimizeCallInst(BasicBlock::iterator *bbb);
// A peephole optimization to optimize bit extract sequences.
bool optimizeBitExtract(Instruction *inst);
// A peephole optimization to optimize bit insert sequences.
bool optimizeBitInsert(Instruction *inst);
bool setupBitInsert(Instruction *base,
Instruction *&src,
Constant *&mask,
Constant *&shift);
// Expand the bit field insert instruction on versions of OpenCL that
// don't support it.
bool expandBFI(CallInst *CI);
// Expand the bit field mask instruction on version of OpenCL that
// don't support it.
bool expandBFM(CallInst *CI);
// On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
// this case we need to expand them. These functions check for 24bit functions
// and then expand.
bool isSigned24BitOps(CallInst *CI);
void expandSigned24BitOps(CallInst *CI);
// One optimization that can occur is that if the required workgroup size is
// specified then the result of get_local_size is known at compile time and
// can be returned accordingly.
bool isRWGLocalOpt(CallInst *CI);
// On northern island cards, the division is slightly less accurate than on
// previous generations, so we need to utilize a more accurate division. So we
// can translate the accurate divide to a normal divide on all other cards.
bool convertAccurateDivide(CallInst *CI);
void expandAccurateDivide(CallInst *CI);
// If the alignment is set incorrectly, it can produce really inefficient
// code. This checks for this scenario and fixes it if possible.
bool correctMisalignedMemOp(Instruction *inst);
// If we are in no opt mode, then we need to make sure that
// local samplers are properly propagated as constant propagation
// doesn't occur and we need to know the value of kernel defined
// samplers at compile time.
bool propagateSamplerInst(CallInst *CI);
// Helper functions
// Group of functions that recursively calculate the size of a structure based
// on its sub-types.
size_t getTypeSize(Type * const T, bool dereferencePtr = false);
size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
// LLVM context of the function being processed; assigned in runOnFunction.
LLVMContext *mCTX;
// Function currently being processed; assigned in runOnFunction.
Function *mF;
// AMDGPU subtarget queried for device capabilities; assigned in
// runOnFunction.
const AMDGPUSubtarget *mSTM;
// (call, replacement callee) pairs queued by optimizeCallInst and applied by
// doAtomicConversionIfNeeded.
SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
// "__amdil_is_constant" calls deferred by optimizeCallInst when the
// optimization level is CodeGenOpt::None; folded by
// doIsConstCallConversionIfNeeded.
SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
// Storage for the pass ID declared in the class.
char AMDGPUPeepholeOpt::ID = 0;
// Walks every element of every inner range in [First, Last) and hands the
// callback a pointer to the current inner iterator.
//
// The callback returns true when it has already moved the iterator itself
// (for example because it erased the element it pointed at); otherwise the
// iterator is advanced here. The inner end iterator is captured once per
// outer element. S is never read; it only lets the caller fix the inner
// iterator type. The (copied) callback is returned.
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                           SecondIterator S, Function F) {
  while (First != Last) {
    SecondIterator cur, stop;
    cur = First->begin();
    stop = First->end();
    while (cur != stop) {
      const bool callbackAdvanced = F(&cur);
      if (callbackAdvanced) {
        continue;
      }
      ++cur;
    }
    ++First;
  }
  return F;
}
} // anonymous namespace
namespace llvm {
// Factory for the AMDGPU peephole pass; the returned pass is heap-allocated
// with new and handed to the caller.
FunctionPass *
createAMDGPUPeepholeOpt(TargetMachine &tm) {
  FunctionPass *peepholePass = new AMDGPUPeepholeOpt(tm);
  return peepholePass;
}
} // llvm namespace
// Builds the pass for the given target machine.
//
// Every scalar/pointer member is given a defined value here so that nothing
// reads an indeterminate value if a member function runs before
// runOnFunction has set up the per-function state.
AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm) {
  mChanged = false;
  mDebug = DEBUGME;
  // Same starting value runOnFunction assigns at the top of each run.
  mConvertAtomics = true;
  optLevel = TM.getOptLevel();
  mCTX = NULL;
  mF = NULL;
  mSTM = NULL;
}
// Nothing to release explicitly: the body is empty and the members are
// destroyed by their own destructors.
AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
}
// Human-readable name of this pass.
const char *
AMDGPUPeepholeOpt::getPassName() const {
  const char *passName = "AMDGPU PeepHole Optimization Pass";
  return passName;
}
// Returns true when Ty is a pointer type, or is a struct/array/vector type
// that (recursively) contains a pointer type. A null Ty yields false.
bool
containsPointerType(Type *Ty) {
  if (!Ty) {
    return false;
  }
  switch (Ty->getTypeID()) {
  case Type::PointerTyID:
    return true;
  case Type::VectorTyID:
  case Type::ArrayTyID:
    // The type ID already guarantees the dynamic type, so use the asserting
    // cast<> instead of dereferencing an unchecked dyn_cast<> result.
    return containsPointerType(cast<SequentialType>(Ty)->getElementType());
  case Type::StructTyID: {
    const StructType *ST = cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
         ste = ST->element_end(); stb != ste; ++stb) {
      if (containsPointerType(*stb)) {
        return true;
      }
    }
    return false;
  }
  default:
    return false;
  }
}
// Reports whether any argument of F is a pointer to a struct that itself
// contains a pointer type (see containsPointerType).
bool
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
  for (Function::const_arg_iterator argIt = F.arg_begin(),
       argEnd = F.arg_end(); argIt != argEnd; ++argIt) {
    const Argument *param = argIt;
    const PointerType *ptrTy = dyn_cast<PointerType>(param->getType());
    if (!ptrTy) {
      continue;
    }
    Type *pointee = ptrTy->getElementType();
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    if (isa<StructType>(pointee) && containsPointerType(pointee)) {
      return true;
    }
  }
  return false;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
if (isConstVec.empty()) {
return;
}
for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
CallInst *CI = isConstVec[x];
Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
Type *aType = Type::getInt32Ty(*mCTX);
Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
: ConstantInt::get(aType, 0);
CI->replaceAllUsesWith(Val);
CI->eraseFromParent();
}
isConstVec.clear();
}
// Applies the callee replacements queued in atomicFuncs by optimizeCallInst:
// for each (call, function) pair the last operand of the call (the callee
// slot used throughout this file) is set to the queued function.
//
// The queue is emptied afterwards, mirroring doIsConstCallConversionIfNeeded.
// Without that, entries from an earlier function would be re-applied, and
// mChanged forced to true, on every later runOnFunction.
void
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required
  for (size_t x = 0, size = atomicFuncs.size(); x < size; ++x) {
    CallInst *call = atomicFuncs[x].first;
    call->setOperand(call->getNumOperands() - 1, atomicFuncs[x].second);
  }
  atomicFuncs.clear();
  mChanged = true;
}
// Pass entry point for one function.
//
// Resets the per-function state (mChanged, mF, mSTM, mCTX, mConvertAtomics),
// runs instLevelOptimizations over every instruction of every basic block,
// then applies the deferred atomic and is-constant rewrites. Returns
// mChanged. When mDebug is set the function is dumped before and after.
bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
mChanged = false;
mF = &MF;
mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
if (mDebug) {
MF.dump();
}
mCTX = &MF.getType()->getContext();
mConvertAtomics = true;
// The third argument is only used by safeNestedForEach to deduce the inner
// iterator type.
// NOTE(review): MF.begin()->begin() dereferences the first basic block;
// presumably MF always has a body here - confirm for declarations.
safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
this));
// Deferred rewrites run only after the walk above has finished.
doAtomicConversionIfNeeded(MF);
doIsConstCallConversionIfNeeded();
if (mDebug) {
MF.dump();
}
return mChanged;
}
// Tries the call-specific rewrites on the instruction *bbb points at.
//
// Returns true only when the call was erased; in that case *bbb has already
// been advanced past it, which is the contract safeNestedForEach relies on.
// Returns false when the instruction is not a call, when nothing matched, or
// when work was merely queued (atomicFuncs / isConstVec) for later.
//
// The callee is read from the last operand of the call throughout.
bool
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    // Step the caller's iterator off the call before erasing it.
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If we do not have optimizations, then this
    // cannot be properly evaluated, so we add the
    // call instruction to a vector and process
    // them at the end of processing after the
    // samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    } else {
      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
      Type *aType = Type::getInt32Ty(*mCTX);
      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
        : ConstantInt::get(aType, 0);
      CI->replaceAllUsesWith(Val);
      ++(*bbb);
      CI->eraseFromParent();
      return true;
    }
  }
  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    // getOrInsertFunction may hand back something other than a Function
    // (for example a cast of an existing declaration with a different type),
    // in which case dyn_cast yields null. Only queue and adopt the "_noret"
    // callee when a real Function came back; otherwise keep the original
    // callee instead of queuing a null replacement.
    Function *noRetF = dyn_cast<Function>(
        F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    if (noRetF) {
      F = noRetF;
      // Let make_pair deduce its arguments; explicit template arguments do
      // not compile once make_pair takes forwarding references.
      atomicFuncs.push_back(std::make_pair(CI, F));
    }
  }
  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}
// Decomposes one operand of an 'or' into the pieces optimizeBitInsert needs.
//
// base  - the operand; must be a Shl or an And instruction.
// src   - out: the instruction feeding the shift/mask.
// mask  - out: constant mask of an And, when one was found.
// shift - out: constant shift amount of a Shl, when one was found.
//
// Returns true when the pattern was recognised. On success src is never
// null: the second-level Shl case re-derives src with a dyn_cast, and that
// result is now checked instead of being handed back as a null source.
bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
                                  Instruction *&src,
                                  Constant *&mask,
                                  Constant *&shift) {
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl or a And, we don't fit any of the patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
    if (!src) {
      // The shifted value is not an instruction (for example an argument or
      // a constant), so there is no source to hand back.
      if (mDebug) {
        dbgs() << "Failed setup since the shifted operand is not an instruction!\n";
      }
      return false;
    }
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
// Tries to replace a scalar i32 'or' of two masked/shifted values with a
// call to "__amdil_ubit_insert_u32"(width, offset, LHSSrc, RHSSrc).
//
// Returns true when the call was created and all uses of inst were
// redirected to it; returns false when the pattern does not match, when
// optimizations are disabled, on HD4XXX devices, or for vector types.
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
if (!inst) {
return false;
}
if (!inst->isBinaryOp()) {
return false;
}
if (inst->getOpcode() != Instruction::Or) {
return false;
}
if (optLevel == CodeGenOpt::None) {
return false;
}
// We want to do an optimization on a sequence of ops that in the end equals a
// single ISA instruction.
// The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
// Some simplified versions of this pattern are as follows:
// (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
// ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
// (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
// (A & B) | (D << F) when (1 << F) >= B
// (A << C) | (D & E) when (1 << C) >= E
if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
// The HD4XXX hardware doesn't support the ubit_insert instruction.
return false;
}
Type *aType = inst->getType();
bool isVector = aType->isVectorTy();
int numEle = 1;
// This optimization only works on 32bit integers.
if (aType->getScalarType()
!= Type::getInt32Ty(inst->getContext())) {
return false;
}
if (isVector) {
const VectorType *VT = dyn_cast<VectorType>(aType);
numEle = VT->getNumElements();
// We currently cannot support more than 4 elements in a intrinsic and we
// cannot support Vec3 types.
if (numEle > 4 || numEle == 3) {
return false;
}
}
// TODO: Handle vectors.
// Every vector type is rejected here, so isVector is false for the rest of
// the function.
if (isVector) {
if (mDebug) {
dbgs() << "!!! Vectors are not supported yet!\n";
}
return false;
}
Instruction *LHSSrc = NULL, *RHSSrc = NULL;
Constant *LHSMask = NULL, *RHSMask = NULL;
Constant *LHSShift = NULL, *RHSShift = NULL;
Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
if (mDebug) {
dbgs() << "Found an OR Operation that failed setup!\n";
inst->dump();
if (LHS) { LHS->dump(); }
if (LHSSrc) { LHSSrc->dump(); }
if (LHSMask) { LHSMask->dump(); }
if (LHSShift) { LHSShift->dump(); }
}
// There was an issue with the setup for BitInsert.
return false;
}
if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
if (mDebug) {
dbgs() << "Found an OR Operation that failed setup!\n";
inst->dump();
if (RHS) { RHS->dump(); }
if (RHSSrc) { RHSSrc->dump(); }
if (RHSMask) { RHSMask->dump(); }
if (RHSShift) { RHSShift->dump(); }
}
// There was an issue with the setup for BitInsert.
return false;
}
if (mDebug) {
dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
dbgs() << "Op: "; inst->dump();
dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
}
Constant *offset = NULL;
Constant *width = NULL;
uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
// NOTE(review): the dyn_cast<ConstantInt> results below are dereferenced
// without a null check; a mask/shift Constant that is not a ConstantInt
// would be a null dereference here - confirm whether that can occur.
lhsMaskVal = (LHSMask
? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
rhsMaskVal = (RHSMask
? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
lhsShiftVal = (LHSShift
? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
rhsShiftVal = (RHSShift
? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
// With no mask, the width is taken as the bits remaining above the shift and
// the offset as the shift amount itself.
lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
// TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
// Bail out when at least one mask is non-zero and both masks are equal.
if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
return false;
}
// Pick the side whose field starts at or above the end of the other side's
// field; that side becomes the inserted value.
if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
offset = ConstantInt::get(aType, lhsMaskOffset, false);
width = ConstantInt::get(aType, lhsMaskWidth, false);
RHSSrc = RHS;
if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
return false;
}
if (!LHSShift) {
LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
"MaskShr", LHS);
} else if (lhsShiftVal != lhsMaskOffset) {
LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
"MaskShr", LHS);
}
if (mDebug) {
dbgs() << "Optimizing LHS!\n";
}
} else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
offset = ConstantInt::get(aType, rhsMaskOffset, false);
width = ConstantInt::get(aType, rhsMaskWidth, false);
LHSSrc = RHSSrc;
RHSSrc = LHS;
if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
return false;
}
if (!RHSShift) {
LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
"MaskShr", RHS);
} else if (rhsShiftVal != rhsMaskOffset) {
LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
"MaskShr", RHS);
}
if (mDebug) {
dbgs() << "Optimizing RHS!\n";
}
} else {
if (mDebug) {
dbgs() << "Failed constraint 3!\n";
}
return false;
}
if (mDebug) {
dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
}
if (!offset || !width) {
if (mDebug) {
dbgs() << "Either width or offset are NULL, failed detection!\n";
}
return false;
}
// Lets create the function signature.
std::vector<Type *> callTypes;
callTypes.push_back(aType);
callTypes.push_back(aType);
callTypes.push_back(aType);
callTypes.push_back(aType);
FunctionType *funcType = FunctionType::get(aType, callTypes, false);
std::string name = "__amdil_ubit_insert";
if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
Function *Func =
dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
getOrInsertFunction(llvm::StringRef(name), funcType));
Value *Operands[4] = {
width,
offset,
LHSSrc,
RHSSrc
};
CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
if (mDebug) {
dbgs() << "Old Inst: ";
inst->dump();
dbgs() << "New Inst: ";
CI->dump();
dbgs() << "\n\n";
}
// The new call takes over every use of the 'or'; the 'or' itself is not
// erased in this function.
CI->insertBefore(inst);
inst->replaceAllUsesWith(CI);
return true;
}
// Tries to replace a scalar i32 '(A >> B) & C' (C a low-bit mask, B and C
// constant) with a call to a bit-extract function.
//
// Returns true when the call was created and all uses of inst were
// redirected to it; returns false when the pattern does not match, when
// optimizations are disabled, on HD4XXX devices, or for vector types.
bool
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
if (!inst) {
return false;
}
if (!inst->isBinaryOp()) {
return false;
}
if (inst->getOpcode() != Instruction::And) {
return false;
}
if (optLevel == CodeGenOpt::None) {
return false;
}
// We want to do some simple optimizations on Shift right/And patterns. The
// basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
// value smaller than 32 and C is a mask. If C is a constant value, then the
// following transformation can occur. For signed integers, it turns into the
// function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
// integers, it turns into the function call dst =
// __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
// can be found in Section 7.9 of the ATI IL spec of the stream SDK for
// Evergreen hardware.
// NOTE(review): the code below does not match the comment above - the call
// it builds is named "llvm.AMDGPU.bit.extract.u32." with operands
// (A, B, width) regardless of signedness. Confirm which is intended.
if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
// This does not work on HD4XXX hardware.
return false;
}
Type *aType = inst->getType();
bool isVector = aType->isVectorTy();
// XXX Support vector types
// Every vector type is rejected here, so the isVector branches further down
// are never taken.
if (isVector) {
return false;
}
int numEle = 1;
// This only works on 32bit integers
if (aType->getScalarType()
!= Type::getInt32Ty(inst->getContext())) {
return false;
}
if (isVector) {
const VectorType *VT = dyn_cast<VectorType>(aType);
numEle = VT->getNumElements();
// We currently cannot support more than 4 elements in a intrinsic and we
// cannot support Vec3 types.
if (numEle > 4 || numEle == 3) {
return false;
}
}
BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
// If the first operand is not a shift instruction, then we can return as it
// doesn't match this pattern.
if (!ShiftInst || !ShiftInst->isShift()) {
return false;
}
// If we are a shift left, then we need don't match this pattern.
if (ShiftInst->getOpcode() == Instruction::Shl) {
return false;
}
// Only used as the isSigned argument of the ConstantInt::get calls below.
bool isSigned = ShiftInst->isArithmeticShift();
Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
// Lets make sure that the shift value and the and mask are constant integers.
if (!AndMask || !ShrVal) {
return false;
}
Constant *newMaskConst;
Constant *shiftValConst;
if (isVector) {
// Handle the vector case
std::vector<Constant *> maskVals;
std::vector<Constant *> shiftVals;
ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
Type *scalarType = AndMaskVec->getType()->getScalarType();
assert(AndMaskVec->getNumOperands() ==
ShrValVec->getNumOperands() && "cannot have a "
"combination where the number of elements to a "
"shift and an and are different!");
for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
if (!AndCI || !ShiftIC) {
return false;
}
uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
if (!isMask_32(maskVal)) {
return false;
}
maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
// If the mask or shiftval is greater than the bitcount, then break out.
if (maskVal >= 32 || shiftVal >= 32) {
return false;
}
// If the mask val is greater than the the number of original bits left
// then this optimization is invalid.
if (maskVal > (32 - shiftVal)) {
return false;
}
maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
}
newMaskConst = ConstantVector::get(maskVals);
shiftValConst = ConstantVector::get(shiftVals);
} else {
// Handle the scalar case
// NOTE(review): AndMask and ShrVal are only known to be Constants; the
// dyn_cast<ConstantInt> results below are dereferenced without a null check.
uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
// This must be a mask value where all lower bits are set to 1 and then any
// bit higher is set to 0.
if (!isMask_32(maskVal)) {
return false;
}
// From here on maskVal holds the width of the mask (its count of trailing
// one bits), not the mask itself.
maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
// Count the number of bits set in the mask, this is the width of the
// resulting bit set that is extracted from the source value.
uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
// If the mask or shift val is greater than the bitcount, then break out.
if (maskVal >= 32 || shiftVal >= 32) {
return false;
}
// If the mask val is greater than the the number of original bits left then
// this optimization is invalid.
if (maskVal > (32 - shiftVal)) {
return false;
}
newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
}
// Lets create the function signature.
std::vector<Type *> callTypes;
callTypes.push_back(aType);
callTypes.push_back(aType);
callTypes.push_back(aType);
FunctionType *funcType = FunctionType::get(aType, callTypes, false);
std::string name = "llvm.AMDGPU.bit.extract.u32";
if (isVector) {
name += ".v" + itostr(numEle) + "i32";
} else {
name += ".";
}
// Lets create the function.
Function *Func =
dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
getOrInsertFunction(llvm::StringRef(name), funcType));
// Operand order: value being shifted, shift amount, field width.
Value *Operands[3] = {
ShiftInst->getOperand(0),
shiftValConst,
newMaskConst
};
// Lets create the Call with the operands
CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
CI->setDoesNotAccessMemory();
// The new call takes over every use of the 'and'; the 'and' itself is not
// erased in this function.
CI->insertBefore(inst);
inst->replaceAllUsesWith(CI);
return true;
}
// Expands a call whose callee name starts with "__amdil_bfi" into plain
// bitwise instructions inserted before the call:
//   __amdil_bfi(A, B, C) => (A & B) | (~A & C)
// All uses of the call are redirected to the result. The call itself is left
// for the caller to erase. Returns false for a null or non-matching call.
bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *callee = CI->getOperand(CI->getNumOperands() - 1);
  if (!callee->getName().startswith("__amdil_bfi")) {
    return false;
  }
  // 32-bit all-ones constant, splatted across the lanes for vector operands;
  // xor-ing with it produces ~A.
  Type *operandTy = CI->getOperand(0)->getType();
  Constant *allOnes = ConstantInt::get(CI->getContext(),
                                       APInt(32, StringRef("-1"), 10));
  if (operandTy->isVectorTy()) {
    const size_t laneCount = dyn_cast<VectorType>(operandTy)->getNumElements();
    std::vector<Constant *> lanes;
    for (size_t lane = 0; lane < laneCount; ++lane) {
      lanes.push_back(allOnes);
    }
    allOnes = ConstantVector::get(lanes);
  }
  // __amdil_bfi => (A & B) | (~A & C)
  BinaryOperator *selected =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *inverted =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), allOnes,
                           "bfi_not", CI);
  BinaryOperator *remainder =
    BinaryOperator::Create(Instruction::And, inverted, CI->getOperand(2),
                           "bfi_and", CI);
  BinaryOperator *merged =
    BinaryOperator::Create(Instruction::Or, selected, remainder,
                           "bfi_or", CI);
  CI->replaceAllUsesWith(merged);
  return true;
}
// Expands a call whose callee name starts with "__amdil_bfm" into plain
// integer instructions inserted before the call:
//   __amdil_bfm(src0, src1) => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1F)
// All uses of the call are redirected to the result. The call itself is left
// for the caller to erase. Returns false for a null or non-matching call.
bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *callee = CI->getOperand(CI->getNumOperands() - 1);
  if (!callee->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
  // Scalar i32 constants first; splat them when the operands are vectors.
  Constant *fiveBitMask = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
  Constant *one = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  Type *operandTy = CI->getOperand(0)->getType();
  if (operandTy->isVectorTy()) {
    const size_t laneCount = dyn_cast<VectorType>(operandTy)->getNumElements();
    std::vector<Constant*> maskLanes, oneLanes;
    for (size_t lane = 0; lane < laneCount; ++lane) {
      maskLanes.push_back(fiveBitMask);
      oneLanes.push_back(one);
    }
    fiveBitMask = ConstantVector::get(maskLanes);
    one = ConstantVector::get(oneLanes);
  }
  BinaryOperator *widthBits =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           fiveBitMask, "bfm_mask", CI);
  BinaryOperator *onePastField =
    BinaryOperator::Create(Instruction::Shl, one, widthBits, "bfm_shl", CI);
  BinaryOperator *lowOnes =
    BinaryOperator::Create(Instruction::Sub, onePastField, one,
                           "bfm_sub", CI);
  BinaryOperator *offsetBits =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
                           fiveBitMask, "bfm_mask", CI);
  BinaryOperator *shiftedMask =
    BinaryOperator::Create(Instruction::Shl, lowOnes, offsetBits,
                           "bfm_shl", CI);
  CI->replaceAllUsesWith(shiftedMask);
  return true;
}
// Runs the per-instruction rewrites on the instruction *bbb points at.
//
// Returns true only when optimizeCallInst erased the instruction and already
// advanced *bbb (the contract safeNestedForEach relies on); returns false
// otherwise, in which case the caller advances the iterator.
//
// Any rewrite that reports success is recorded in mChanged so that
// runOnFunction tells the pass manager the IR was modified. Previously only
// the deferred atomic conversion ever set the flag.
bool
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
  // Captured before optimizeCallInst, which may move *bbb.
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    mChanged = true;
    return true;
  }
  // At most one of these fires per instruction, tried in this order.
  if (optimizeBitExtract(inst)
      || optimizeBitInsert(inst)
      || correctMisalignedMemOp(inst)) {
    mChanged = true;
  }
  return false;
}
// Resets the alignment of a struct-typed load or store to 0 when the
// accessed type is larger than the recorded alignment and that alignment is
// below 4. Returns true when the alignment was changed; returns false for
// any other instruction or when the conditions are not met.
bool
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
  LoadInst *load = dyn_cast<LoadInst>(inst);
  StoreInst *store = dyn_cast<StoreInst>(inst);
  if (!load && !store) {
    return false;
  }
  // A load is sized by its result type, a store by the value it writes.
  const unsigned alignment =
    load ? load->getAlignment() : store->getAlignment();
  Type *accessTy =
    load ? inst->getType() : store->getValueOperand()->getType();
  const unsigned size = getTypeSize(accessTy);
  if (size <= alignment) {
    return false;
  }
  if (!accessTy->isStructTy()) {
    return false;
  }
  if (alignment >= 4) {
    return false;
  }
  if (load) {
    load->setAlignment(0);
  } else {
    store->setAlignment(0);
  }
  return true;
}
// Returns true when CI calls one of the signed 24-bit helpers and the device
// does not implement signed 24-bit operations in hardware, i.e. when the
// call must be expanded by expandSigned24BitOps.
//
// The callee (last operand) matches when its name starts with
// "__amdil_imad24" or "__amdil_imul24"; the latter prefix also covers
// "__amdil_imul24_high". The previous code additionally compared the
// 14-character prefix with the 20-character literal "__amdil__imul24_high",
// which could never be equal, so that test has been dropped.
bool
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
  if (!CI) {
    return false;
  }
  const StringRef callee =
    CI->getOperand(CI->getNumOperands() - 1)->getName();
  if (!callee.startswith("__amdil_imad24")
      && !callee.startswith("__amdil_imul24")) {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}
// Replaces a signed 24-bit helper call with its 32-bit equivalent, inserted
// before CI; all uses of CI are redirected to the replacement. CI itself is
// left for the caller to erase.
//
// On 7XX and 8XX we do not have signed 24bit, so we need to
// expand it to the following:
//   imad24      turns into a call to __amdil_imad_{i32,vNi32}
//   imul24_high turns into a call to __amdil_imul_high_{i32,vNi32}
//   imul24      turns into a 32bit mul instruction
//
// "__amdil_imul24_high" is tested before "__amdil_imul24": the shorter name
// is a prefix of the longer one, so testing it first (as the code used to)
// made the imul24_high case unreachable and lowered it to a plain mul.
void
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
  assert(isSigned24BitOps(CI) && "Must be a "
      "signed 24 bit operation to call this function!");
  const StringRef callee = CI->getOperand(CI->getNumOperands()-1)->getName();
  const bool isMad = callee.startswith("__amdil_imad24");
  const bool isMulHigh = callee.startswith("__amdil_imul24_high");
  if (isMad || isMulHigh) {
    // Both cases forward the leading call arguments (three for imad, two for
    // imul_high) to a same-typed 32-bit helper function.
    const unsigned numArgs = isMad ? 3 : 2;
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    std::vector<Value*> callArgs;
    for (unsigned i = 0; i < numArgs; ++i) {
      callTypes.push_back(CI->getOperand(i)->getType());
      callArgs.push_back(CI->getOperand(i));
    }
    FunctionType *funcType = FunctionType::get(aType, callTypes, false);
    std::string name = isMad ? "__amdil_imad" : "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
    CallInst *nCI =
      CallInst::Create(Func, callArgs, isMad ? "imad24" : "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (callee.startswith("__amdil_imul24")) {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
                             CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  }
}
/// Returns true when \p CI is non-null and its last operand is named exactly
/// "__amdil_get_local_size_int".
bool AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
  if (CI == NULL) {
    return false;
  }
  Value *lastOperand = CI->getOperand(CI->getNumOperands() - 1);
  return lastOperand->getName() == "__amdil_get_local_size_int";
}
/// Decides whether \p CI is an "improved divide" call that should be expanded.
///
/// Returns false for a null \p CI and for an HD6XXX-generation device whose
/// name is "cayman". Otherwise returns true exactly when the first 20
/// characters of the last operand's name are "__amdil_improved_div".
bool AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
  if (CI == NULL) {
    return false;
  }
  // The device name is only queried once the generation already matches.
  const bool isCayman =
    mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
    && (mSTM->getDeviceName() == "cayman");
  if (isCayman) {
    return false;
  }
  Value *lastOperand = CI->getOperand(CI->getNumOperands() - 1);
  return lastOperand->getName().substr(0, 20) == "__amdil_improved_div";
}
/// Replaces an "improved divide" call with an FDiv instruction.
///
/// \param CI call for which convertAccurateDivide() returns true (asserted).
///
/// The FDiv of operands 0 and 1 is created in front of \p CI and takes over
/// all of its uses; \p CI is not erased here.
void AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  Value *numerator = CI->getOperand(0);
  Value *denominator = CI->getOperand(1);
  BinaryOperator *quotient = BinaryOperator::Create(
      Instruction::FDiv, numerator, denominator, "fdiv32", CI);
  CI->replaceAllUsesWith(quotient);
}
/// Tries to replace a loaded sampler value with its constant initializer.
///
/// Only acts when optLevel is CodeGenOpt::None and \p CI is a call whose last
/// operand is named after one of the four __amdil_image{2d,3d}_read_{norm,
/// unnorm} functions. Operand 1 of the call must be a load from the private
/// address space whose pointer is a global variable with a 32-bit integer
/// initializer. If all of that holds, every use of the load is replaced by
/// the initializer.
///
/// \returns true if the load's uses were replaced, false otherwise.
bool AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
  if (optLevel != CodeGenOpt::None || !CI) {
    return false;
  }

  const unsigned calleeIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(calleeIdx)->getName();
  const bool isImageRead =
    calleeName == "__amdil_image2d_read_norm"
    || calleeName == "__amdil_image2d_read_unnorm"
    || calleeName == "__amdil_image3d_read_norm"
    || calleeName == "__amdil_image3d_read_unnorm";
  if (!isImageRead) {
    return false;
  }

  // The sampler is taken from operand 1 and must come straight from a load
  // out of the private address space.
  LoadInst *samplerLoad = dyn_cast<LoadInst>(CI->getOperand(1));
  if (!samplerLoad
      || samplerLoad->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return false;
  }

  // Anything that is not loaded from a global variable cannot be folded.
  GlobalVariable *samplerGV =
    dyn_cast<GlobalVariable>(samplerLoad->getPointerOperand());
  if (!samplerGV || !samplerGV->hasInitializer()) {
    return false;
  }

  // Only a 32-bit integer initializer is accepted.
  Constant *samplerVal = samplerGV->getInitializer();
  if (!samplerVal->getType()->isIntegerTy(32)) {
    return false;
  }

  // Substitute the constant for every use of the load.
  samplerLoad->replaceAllUsesWith(samplerVal);
  return true;
}
/// Per-module initialization hook; does no work and always returns false.
bool AMDGPUPeepholeOpt::doInitialization(Module &M) {
  const bool changed = false;
  return changed;
}
/// Per-module finalization hook; does no work and always returns false.
bool AMDGPUPeepholeOpt::doFinalization(Module &M) {
  const bool changed = false;
  return changed;
}
/// Declares the analyses this pass depends on and preserves.
///
/// Requires MachineFunctionAnalysis, forwards to the FunctionPass
/// implementation, and then marks all analyses as preserved.
void AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  // Dependency on the machine function analysis.
  AU.addRequired<MachineFunctionAnalysis>();
  // Let the base class add whatever it needs.
  FunctionPass::getAnalysisUsage(AU);
  // Declare every analysis as preserved.
  AU.setPreservesAll();
}
/// Returns the size of \p T by dispatching on its type ID.
///
/// Pointer, integer, struct, array, function and vector types are forwarded
/// to the matching getTypeSize() overload together with \p dereferencePtr.
/// Every other type yields getPrimitiveSizeInBits() >> 3. A null \p T
/// yields 0.
size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  if (!T) {
    return 0;
  }
  switch (T->getTypeID()) {
  case Type::PointerTyID:
    return getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
  case Type::IntegerTyID:
    return getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
  case Type::StructTyID:
    return getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
  case Type::ArrayTyID:
    return getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
  case Type::FunctionTyID:
    return getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
  case Type::VectorTyID:
    return getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
    // When the assert is compiled out, control falls through to the generic
    // primitive-size computation below.
  case Type::FloatTyID:
  case Type::DoubleTyID:
  default:
    break;
  }
  // Generic case: primitive bit width shifted right by 3 (divided by 8).
  return T->getPrimitiveSizeInBits() >> 3;
}
/// Returns the sum of getTypeSize() over every element type of \p ST, or 0
/// when \p ST is null. \p dereferencePtr is passed through to each element.
size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
                                      bool dereferencePtr) {
  size_t total = 0;
  if (ST) {
    for (StructType::element_iterator it = ST->element_begin(),
         end = ST->element_end(); it != end; ++it) {
      Type *elementType = *it;
      total += getTypeSize(elementType, dereferencePtr);
    }
  }
  return total;
}
/// Returns the bit width of \p IT shifted right by 3 (whole bytes, rounded
/// down), or 0 when \p IT is null. \p dereferencePtr is not used.
size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
                                      bool dereferencePtr) {
  if (!IT) {
    return 0;
  }
  return IT->getBitWidth() >> 3;
}
/// Function types have no size: asserts unconditionally, and returns 0 when
/// assertions are compiled out. Neither parameter is used.
size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
                                      bool dereferencePtr) {
  assert(0 && "Should not be able to calculate the size of an function type");
  const size_t noSize = 0;
  return noSize;
}
/// Returns the element size of \p AT multiplied by its element count, or 0
/// when \p AT is null. \p dereferencePtr is passed through to the element
/// type's size computation.
size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
                                      bool dereferencePtr) {
  if (!AT) {
    return 0;
  }
  const size_t elementSize = getTypeSize(AT->getElementType(), dereferencePtr);
  return static_cast<size_t>(elementSize * AT->getNumElements());
}
/// Returns the bit width of \p VT shifted right by 3 (whole bytes, rounded
/// down), or 0 when \p VT is null. \p dereferencePtr is not used.
size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
                                      bool dereferencePtr) {
  if (!VT) {
    return 0;
  }
  return VT->getBitWidth() >> 3;
}
/// Returns a size for the pointer type \p PT.
///
///  - null \p PT: 0.
///  - pointer to a struct in the private address space: the size of the
///    pointed-to struct (using the StructType overload's default for its
///    second argument).
///  - otherwise, when \p dereferencePtr is set: the sum of getTypeSize() over
///    the types contained in \p PT.
///  - otherwise: the constant 4 (presumably the pointer width in bytes --
///    TODO confirm).
size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
                                      bool dereferencePtr) {
  if (!PT) {
    return 0;
  }

  Type *pointee = PT->getElementType();
  const bool isPrivateStruct =
    pointee->getTypeID() == Type::StructTyID &&
    PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
  if (isPrivateStruct) {
    return getTypeSize(dyn_cast<StructType>(pointee));
  }

  if (!dereferencePtr) {
    return 4;
  }

  size_t total = 0;
  const size_t numContained = PT->getNumContainedTypes();
  for (size_t idx = 0; idx != numContained; ++idx) {
    total += getTypeSize(PT->getContainedType(idx), dereferencePtr);
  }
  return total;
}
/// Always reports a size of 4 for an opaque type; neither parameter is used.
/// An assertion that used to reject opaque types is disabled:
//assert(0 && "Should not be able to calculate the size of an opaque type");
size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
                                      bool dereferencePtr) {
  const size_t opaqueSize = 4;
  return opaqueSize;
}