llvm-6502/lib/Target/R600/AMDGPUPromoteAlloca.cpp
Mehdi Amini c94da20917 Make DataLayout Non-Optional in the Module
Summary:
DataLayout keeps the string used for its creation.

As a side effect it is no longer needed in the Module.
This is "almost" NFC, the string is no longer
canonicalized, you can't rely on two "equals" DataLayout
having the same string returned by getStringRepresentation().

Get rid of DataLayoutPass: the DataLayout is in the Module

The DataLayout is "per-module", let's enforce this by not
duplicating it more than necessary.
One more step toward non-optionality of the DataLayout in the
module.

Make DataLayout Non-Optional in the Module

Module->getDataLayout() will never returns nullptr anymore.

Reviewers: echristo

Subscribers: resistor, llvm-commits, jholewinski

Differential Revision: http://reviews.llvm.org/D7992

From: Mehdi Amini <mehdi.amini@apple.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231270 91177308-0d34-0410-b5e6-96231b3b80d8
2015-03-04 18:43:29 +00:00

408 lines
13 KiB
C++

//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "amdgpu-promote-alloca"
using namespace llvm;
namespace {
class AMDGPUPromoteAlloca : public FunctionPass,
public InstVisitor<AMDGPUPromoteAlloca> {
static char ID;
Module *Mod;
const AMDGPUSubtarget &ST;
int LocalMemAvailable;
public:
AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
LocalMemAvailable(0) { }
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
void visitAlloca(AllocaInst &I);
};
} // End anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
return false;
}
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
const FunctionType *FTy = F.getFunctionType();
LocalMemAvailable = ST.getLocalMemorySize();
// If the function has any arguments in the local address space, then it's
// possible these arguments require the entire local memory space, so
// we cannot use local memory in the pass.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
const Type *ParamTy = FTy->getParamType(i);
if (ParamTy->isPointerTy() &&
ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
LocalMemAvailable = 0;
DEBUG(dbgs() << "Function has local memory argument. Promoting to "
"local memory disabled.\n");
break;
}
}
if (LocalMemAvailable > 0) {
// Check how much local memory is being used by global objects
for (Module::global_iterator I = Mod->global_begin(),
E = Mod->global_end(); I != E; ++I) {
GlobalVariable *GV = I;
PointerType *GVTy = GV->getType();
if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
continue;
for (Value::use_iterator U = GV->use_begin(),
UE = GV->use_end(); U != UE; ++U) {
Instruction *Use = dyn_cast<Instruction>(*U);
if (!Use)
continue;
if (Use->getParent()->getParent() == &F)
LocalMemAvailable -=
Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType());
}
}
}
LocalMemAvailable = std::max(0, LocalMemAvailable);
DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
visit(F);
return false;
}
static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
return VectorType::get(ArrayTy->getArrayElementType(),
ArrayTy->getArrayNumElements());
}
static Value *
calculateVectorIndex(Value *Ptr,
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
if (isa<AllocaInst>(Ptr))
return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
auto I = GEPIdx.find(GEP);
return I == GEPIdx.end() ? nullptr : I->second;
}
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
// FIXME we only support simple cases
if (GEP->getNumOperands() != 3)
return NULL;
ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
if (!I0 || !I0->isZero())
return NULL;
return GEP->getOperand(2);
}
// Not an instruction handled below to turn into a vector.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst) {
switch (Inst->getOpcode()) {
case Instruction::Load:
case Instruction::Store:
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
return true;
default:
return false;
}
}
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
Type *AllocaTy = Alloca->getAllocatedType();
DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
if (!AllocaTy->isArrayTy() ||
AllocaTy->getArrayElementType()->isVectorTy() ||
AllocaTy->getArrayNumElements() > 4) {
DEBUG(dbgs() << " Cannot convert type to vector");
return false;
}
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
std::vector<Value*> WorkList;
for (User *AllocaUser : Alloca->users()) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
if (!GEP) {
if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
return false;
WorkList.push_back(AllocaUser);
continue;
}
Value *Index = GEPToVectorIndex(GEP);
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
if (!Index) {
DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
return false;
}
GEPVectorIdx[GEP] = Index;
for (User *GEPUser : AllocaUser->users()) {
if (!canVectorizeInst(cast<Instruction>(GEPUser)))
return false;
WorkList.push_back(GEPUser);
}
}
VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
DEBUG(dbgs() << " Converting alloca to vector "
<< *AllocaTy << " -> " << *VectorTy << '\n');
for (std::vector<Value*>::iterator I = WorkList.begin(),
E = WorkList.end(); I != E; ++I) {
Instruction *Inst = cast<Instruction>(*I);
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
Value *Ptr = Inst->getOperand(0);
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
Value *VecValue = Builder.CreateLoad(BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
Inst->replaceAllUsesWith(ExtractElement);
Inst->eraseFromParent();
break;
}
case Instruction::Store: {
Value *Ptr = Inst->getOperand(1);
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
Value *VecValue = Builder.CreateLoad(BitCast);
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
Inst->getOperand(0),
Index);
Builder.CreateStore(NewVecValue, BitCast);
Inst->eraseFromParent();
break;
}
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
break;
default:
Inst->dump();
llvm_unreachable("Inconsistency in instructions promotable to vector");
}
}
return true;
}
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
bool Success = true;
for (User *User : Val->users()) {
if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
continue;
if (isa<CallInst>(User)) {
WorkList.push_back(User);
continue;
}
// FIXME: Correctly handle ptrtoint instructions.
Instruction *UseInst = dyn_cast<Instruction>(User);
if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
return false;
if (!User->getType()->isPointerTy())
continue;
WorkList.push_back(User);
Success &= collectUsesWithPtrTypes(User, WorkList);
}
return Success;
}
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
IRBuilder<> Builder(&I);
// First try to replace the alloca with a vector
Type *AllocaTy = I.getAllocatedType();
DEBUG(dbgs() << "Trying to promote " << I << '\n');
if (tryPromoteAllocaToVector(&I))
return;
DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
// FIXME: This is the maximum work group size. We should try to get
// value from the reqd_work_group_size function attribute if it is
// available.
unsigned WorkGroupSize = 256;
int AllocaSize =
WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
if (AllocaSize > LocalMemAvailable) {
DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
return;
}
std::vector<Value*> WorkList;
if (!collectUsesWithPtrTypes(&I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
return;
}
DEBUG(dbgs() << "Promoting alloca to local memory\n");
LocalMemAvailable -= AllocaSize;
GlobalVariable *GV = new GlobalVariable(
*Mod, ArrayType::get(I.getAllocatedType(), 256), false,
GlobalValue::ExternalLinkage, 0, I.getName(), 0,
GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
FunctionType *FTy = FunctionType::get(
Type::getInt32Ty(Mod->getContext()), false);
AttributeSet AttrSet;
AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
Value *ReadLocalSizeY = Mod->getOrInsertFunction(
"llvm.r600.read.local.size.y", FTy, AttrSet);
Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
"llvm.r600.read.local.size.z", FTy, AttrSet);
Value *ReadTIDIGX = Mod->getOrInsertFunction(
"llvm.r600.read.tidig.x", FTy, AttrSet);
Value *ReadTIDIGY = Mod->getOrInsertFunction(
"llvm.r600.read.tidig.y", FTy, AttrSet);
Value *ReadTIDIGZ = Mod->getOrInsertFunction(
"llvm.r600.read.tidig.z", FTy, AttrSet);
Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
Value *TIdX = Builder.CreateCall(ReadTIDIGX);
Value *TIdY = Builder.CreateCall(ReadTIDIGY);
Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);
Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
Tmp0 = Builder.CreateMul(Tmp0, TIdX);
Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
TID = Builder.CreateAdd(TID, TIdZ);
std::vector<Value*> Indices;
Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
Indices.push_back(TID);
Value *Offset = Builder.CreateGEP(GV, Indices);
I.mutateType(Offset->getType());
I.replaceAllUsesWith(Offset);
I.eraseFromParent();
for (std::vector<Value*>::iterator i = WorkList.begin(),
e = WorkList.end(); i != e; ++i) {
Value *V = *i;
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
Type *EltTy = V->getType()->getPointerElementType();
PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
// The operand's value should be corrected on its own.
if (isa<AddrSpaceCastInst>(V))
continue;
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
V->mutateType(NewTy);
continue;
}
IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
if (!Intr) {
std::vector<Type*> ArgTypes;
for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
ArgIdx != ArgEnd; ++ArgIdx) {
ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
}
Function *F = Call->getCalledFunction();
FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
F->isVarArg());
Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
F->getAttributes());
Function *NewF = cast<Function>(C);
Call->setCalledFunction(NewF);
continue;
}
Builder.SetInsertPoint(Intr);
switch (Intr->getIntrinsicID()) {
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
// These intrinsics are for address space 0 only
Intr->eraseFromParent();
continue;
case Intrinsic::memcpy: {
MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
MemCpy->getLength(), MemCpy->getAlignment(),
MemCpy->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
MemSet->getLength(), MemSet->getAlignment(),
MemSet->isVolatile());
Intr->eraseFromParent();
continue;
}
default:
Intr->dump();
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
}
FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
return new AMDGPUPromoteAlloca(ST);
}