llvm-6502/lib/Target/R600/AMDGPUPromoteAlloca.cpp
David Blaikie 4e933df738 Opaque Pointer Types: GEP API migrations to specify the gep type explicitly
The changes to InstCombine (& SCEV) do seem a bit silly - it doesn't make
anything obviously better to have the caller access the pointer's element
type (the thing I'm trying to remove) than the GEP itself, but it's a
helpful migration step. This will allow me to more obviously lock down
GEP (& Load, etc) API usage, then fix all the code that accesses pointer
element types except the places that need to be removed (most of the
InstCombines) anyway - at which point I'll need to just remove all that
code because it won't be meaningful anymore (there will be no pointer
types, so no bitcasts to combine).

SCEV looks like it'll need some restructuring - we'll have to do a bit
more work for GEP canonicalization, since whether we can even manage to
canonicalize it to a non-ugly GEP will depend on how it's used. I guess
we can do some fun stuff like voting (do 2 out of 3 loads from the GEP
use a type that gives a pretty GEP? Does every typed use of the GEP use
either a specific type or a generic type (i8*, etc)?).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233131 91177308-0d34-0410-b5e6-96231b3b80d8
2015-03-24 23:34:31 +00:00
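
The shape of the migration, as a minimal sketch (BasePtr, EltTy and Indices
are placeholder names, not identifiers from the patch): GEP creation APIs
now take the source element type explicitly instead of deriving it from the
pointer operand's pointee type.

    // Before: the element type is implied by BasePtr's pointee type.
    Value *V = Builder.CreateGEP(BasePtr, Indices);
    // After: the element type is spelled out, so nothing has to look
    // through the pointer type (which opaque pointers will remove).
    Value *V2 = Builder.CreateGEP(EltTy, BasePtr, Indices);

The CreateGEP call in this file's visitAlloca below uses the explicit-type
form.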


//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "amdgpu-promote-alloca"
using namespace llvm;
namespace {
class AMDGPUPromoteAlloca : public FunctionPass,
                            public InstVisitor<AMDGPUPromoteAlloca> {
  static char ID;
  Module *Mod;
  const AMDGPUSubtarget &ST;
  // Bytes of local (LDS) memory still available for promoted allocas.
  int LocalMemAvailable;

public:
  AMDGPUPromoteAlloca(const AMDGPUSubtarget &st)
      : FunctionPass(ID), ST(st), LocalMemAvailable(0) {}
  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;
  const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
  void visitAlloca(AllocaInst &I);
};
} // End anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
return false;
}
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
const FunctionType *FTy = F.getFunctionType();
LocalMemAvailable = ST.getLocalMemorySize();
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
// we cannot use local memory in the pass.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
const Type *ParamTy = FTy->getParamType(i);
if (ParamTy->isPointerTy() &&
ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
LocalMemAvailable = 0;
DEBUG(dbgs() << "Function has local memory argument. Promoting to "
"local memory disabled.\n");
break;
}
}
  if (LocalMemAvailable > 0) {
    // Check how much local memory is already used by global objects.
    for (GlobalVariable &GV : Mod->globals()) {
      PointerType *GVTy = GV.getType();
      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      // Walk the users so we see the instruction doing the using, not the
      // value being used.
      for (User *U : GV.users()) {
        Instruction *UseInst = dyn_cast<Instruction>(U);
        if (!UseInst)
          continue;
        if (UseInst->getParent()->getParent() == &F)
          LocalMemAvailable -=
              Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType());
      }
    }
  }
  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
visit(F);
return false;
}
static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
return VectorType::get(ArrayTy->getArrayElementType(),
ArrayTy->getArrayNumElements());
}
static Value *
calculateVectorIndex(Value *Ptr,
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
if (isa<AllocaInst>(Ptr))
return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
auto I = GEPIdx.find(GEP);
return I == GEPIdx.end() ? nullptr : I->second;
}
static Value *GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: We only support the simple case of a two-index GEP with a zero
  // first index, i.e. getelementptr [N x T]* %alloca, i32 0, i32 %idx.
  if (GEP->getNumOperands() != 3)
    return nullptr;
  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;
  return GEP->getOperand(2);
}
// Returns true if Inst is one of the instruction kinds the vector promotion
// below knows how to rewrite.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst) {
switch (Inst->getOpcode()) {
case Instruction::Load:
case Instruction::Store:
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
return true;
default:
return false;
}
}
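// For instance (a sketch of the rewrite performed below), a store through a
// promotable alloca such as:
//
//   %alloca = alloca [4 x i32]
//   %ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 0, i32 %idx
//   store i32 %val, i32* %ptr
//
// becomes a read-modify-write of the whole vector:
//
//   %bc = bitcast [4 x i32]* %alloca to <4 x i32>*
//   %vec = load <4 x i32>, <4 x i32>* %bc
//   %new = insertelement <4 x i32> %vec, i32 %val, i32 %idx
//   store <4 x i32> %new, <4 x i32>* %bc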
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  Type *AllocaTy = Alloca->getAllocatedType();

  DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  if (!AllocaTy->isArrayTy() ||
      AllocaTy->getArrayElementType()->isVectorTy() ||
      AllocaTy->getArrayNumElements() > 4) {
    DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
std::vector<Value*> WorkList;
for (User *AllocaUser : Alloca->users()) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
if (!GEP) {
if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
return false;
WorkList.push_back(AllocaUser);
continue;
}
Value *Index = GEPToVectorIndex(GEP);
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
if (!Index) {
DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
return false;
}
GEPVectorIdx[GEP] = Index;
for (User *GEPUser : AllocaUser->users()) {
if (!canVectorizeInst(cast<Instruction>(GEPUser)))
return false;
WorkList.push_back(GEPUser);
}
}
VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
DEBUG(dbgs() << " Converting alloca to vector "
<< *AllocaTy << " -> " << *VectorTy << '\n');
  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
Value *Ptr = Inst->getOperand(0);
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
Value *VecValue = Builder.CreateLoad(BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
Inst->replaceAllUsesWith(ExtractElement);
Inst->eraseFromParent();
break;
}
case Instruction::Store: {
Value *Ptr = Inst->getOperand(1);
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
Value *VecValue = Builder.CreateLoad(BitCast);
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
Inst->getOperand(0),
Index);
Builder.CreateStore(NewVecValue, BitCast);
Inst->eraseFromParent();
break;
}
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
break;
default:
Inst->dump();
llvm_unreachable("Inconsistency in instructions promotable to vector");
}
}
return true;
}
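
// Recursively collect every pointer-typed user of Val, plus any call using
// it, into WorkList so the users' types can be rewritten once the alloca has
// been moved into the local address space. Returns false on a use we do not
// know how to rewrite (e.g. ptrtoint).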
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  bool Success = true;
  for (User *U : Val->users()) {
    if (std::find(WorkList.begin(), WorkList.end(), U) != WorkList.end())
      continue;
    if (isa<CallInst>(U)) {
      WorkList.push_back(U);
      continue;
    }

    // FIXME: Correctly handle ptrtoint instructions.
    Instruction *UseInst = dyn_cast<Instruction>(U);
    if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    // Non-pointer users do not need their types rewritten.
    if (!U->getType()->isPointerTy())
      continue;

    WorkList.push_back(U);
    Success &= collectUsesWithPtrTypes(U, WorkList);
  }
  return Success;
}
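
// Promote the alloca into local (LDS) memory: every workitem in the
// workgroup gets its own slot of an LDS array sized by the maximum
// workgroup size and indexed by the workitem's flattened id.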
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
IRBuilder<> Builder(&I);
// First try to replace the alloca with a vector
Type *AllocaTy = I.getAllocatedType();
DEBUG(dbgs() << "Trying to promote " << I << '\n');
if (tryPromoteAllocaToVector(&I))
return;
DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
  // FIXME: This is the maximum work group size. We should try to get the
  // value from the reqd_work_group_size function attribute if it is
  // available.
unsigned WorkGroupSize = 256;
int AllocaSize =
WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
if (AllocaSize > LocalMemAvailable) {
DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
return;
}
std::vector<Value*> WorkList;
if (!collectUsesWithPtrTypes(&I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
return;
}
DEBUG(dbgs() << "Promoting alloca to local memory\n");
LocalMemAvailable -= AllocaSize;
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::ExternalLinkage, nullptr, I.getName(),
      nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
FunctionType *FTy = FunctionType::get(
Type::getInt32Ty(Mod->getContext()), false);
  // Mark the workitem/workgroup intrinsics readnone. AttributeSet is
  // immutable, so the result of addAttribute must be assigned back.
  AttributeSet AttrSet;
  AttrSet = AttrSet.addAttribute(Mod->getContext(),
                                 AttributeSet::FunctionIndex,
                                 Attribute::ReadNone);
Value *ReadLocalSizeY = Mod->getOrInsertFunction(
"llvm.r600.read.local.size.y", FTy, AttrSet);
Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
"llvm.r600.read.local.size.z", FTy, AttrSet);
Value *ReadTIDIGX = Mod->getOrInsertFunction(
"llvm.r600.read.tidig.x", FTy, AttrSet);
Value *ReadTIDIGY = Mod->getOrInsertFunction(
"llvm.r600.read.tidig.y", FTy, AttrSet);
Value *ReadTIDIGZ = Mod->getOrInsertFunction(
"llvm.r600.read.tidig.z", FTy, AttrSet);
Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
Value *TIdX = Builder.CreateCall(ReadTIDIGX);
Value *TIdY = Builder.CreateCall(ReadTIDIGY);
Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);
Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
Tmp0 = Builder.CreateMul(Tmp0, TIdX);
Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
TID = Builder.CreateAdd(TID, TIdZ);
std::vector<Value*> Indices;
Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
Indices.push_back(TID);
Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
I.mutateType(Offset->getType());
I.replaceAllUsesWith(Offset);
I.eraseFromParent();
  for (Value *V : WorkList) {
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
Type *EltTy = V->getType()->getPointerElementType();
PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
    // Leave addrspacecasts alone; their pointer operand is rewritten when
    // it is visited on its own.
    if (isa<AddrSpaceCastInst>(V))
      continue;
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
V->mutateType(NewTy);
continue;
}
IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
if (!Intr) {
std::vector<Type*> ArgTypes;
for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
ArgIdx != ArgEnd; ++ArgIdx) {
ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
}
Function *F = Call->getCalledFunction();
FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
F->isVarArg());
    Constant *C = Mod->getOrInsertFunction(
        StringRef(F->getName().str() + ".local"), NewType,
        F->getAttributes());
Function *NewF = cast<Function>(C);
Call->setCalledFunction(NewF);
continue;
}
Builder.SetInsertPoint(Intr);
switch (Intr->getIntrinsicID()) {
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
// These intrinsics are for address space 0 only
Intr->eraseFromParent();
continue;
case Intrinsic::memcpy: {
MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
MemCpy->getLength(), MemCpy->getAlignment(),
MemCpy->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
MemSet->getLength(), MemSet->getAlignment(),
MemSet->isVolatile());
Intr->eraseFromParent();
continue;
}
default:
Intr->dump();
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
}
FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
return new AMDGPUPromoteAlloca(ST);
}