//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass eliminates allocas by either converting them into vectors or // by migrating them to local address space. // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "amdgpu-promote-alloca" using namespace llvm; namespace { class AMDGPUPromoteAlloca : public FunctionPass, public InstVisitor { static char ID; Module *Mod; const AMDGPUSubtarget &ST; int LocalMemAvailable; public: AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), LocalMemAvailable(0) { } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; const char *getPassName() const override { return "AMDGPU Promote Alloca"; } void visitAlloca(AllocaInst &I); }; } // End anonymous namespace char AMDGPUPromoteAlloca::ID = 0; bool AMDGPUPromoteAlloca::doInitialization(Module &M) { Mod = &M; return false; } bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { const FunctionType *FTy = F.getFunctionType(); LocalMemAvailable = ST.getLocalMemorySize(); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { const Type *ParamTy = FTy->getParamType(i); if (ParamTy->isPointerTy() && ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { LocalMemAvailable = 0; DEBUG(dbgs() << "Function has local memory argument. Promoting to " "local memory disabled.\n"); break; } } if (LocalMemAvailable > 0) { // Check how much local memory is being used by global objects for (Module::global_iterator I = Mod->global_begin(), E = Mod->global_end(); I != E; ++I) { GlobalVariable *GV = I; PointerType *GVTy = GV->getType(); if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; for (Value::use_iterator U = GV->use_begin(), UE = GV->use_end(); U != UE; ++U) { Instruction *Use = dyn_cast(*U); if (!Use) continue; if (Use->getParent()->getParent() == &F) LocalMemAvailable -= Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType()); } } } LocalMemAvailable = std::max(0, LocalMemAvailable); DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); visit(F); return false; } static VectorType *arrayTypeToVecType(const Type *ArrayTy) { return VectorType::get(ArrayTy->getArrayElementType(), ArrayTy->getArrayNumElements()); } static Value * calculateVectorIndex(Value *Ptr, const std::map &GEPIdx) { if (isa(Ptr)) return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); GetElementPtrInst *GEP = cast(Ptr); auto I = GEPIdx.find(GEP); return I == GEPIdx.end() ? nullptr : I->second; } static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // FIXME we only support simple cases if (GEP->getNumOperands() != 3) return NULL; ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); if (!I0 || !I0->isZero()) return NULL; return GEP->getOperand(2); } // Not an instruction handled below to turn into a vector. // // TODO: Check isTriviallyVectorizable for calls and handle other // instructions. static bool canVectorizeInst(Instruction *Inst) { switch (Inst->getOpcode()) { case Instruction::Load: case Instruction::Store: case Instruction::BitCast: case Instruction::AddrSpaceCast: return true; default: return false; } } static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { Type *AllocaTy = Alloca->getAllocatedType(); DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); // FIXME: There is no reason why we can't support larger arrays, we // are just being conservative for now. if (!AllocaTy->isArrayTy() || AllocaTy->getArrayElementType()->isVectorTy() || AllocaTy->getArrayNumElements() > 4) { DEBUG(dbgs() << " Cannot convert type to vector"); return false; } std::map GEPVectorIdx; std::vector WorkList; for (User *AllocaUser : Alloca->users()) { GetElementPtrInst *GEP = dyn_cast(AllocaUser); if (!GEP) { if (!canVectorizeInst(cast(AllocaUser))) return false; WorkList.push_back(AllocaUser); continue; } Value *Index = GEPToVectorIndex(GEP); // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. if (!Index) { DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); return false; } GEPVectorIdx[GEP] = Index; for (User *GEPUser : AllocaUser->users()) { if (!canVectorizeInst(cast(GEPUser))) return false; WorkList.push_back(GEPUser); } } VectorType *VectorTy = arrayTypeToVecType(AllocaTy); DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); for (std::vector::iterator I = WorkList.begin(), E = WorkList.end(); I != E; ++I) { Instruction *Inst = cast(*I); IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { Value *Ptr = Inst->getOperand(0); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); Value *VecValue = Builder.CreateLoad(BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); Inst->replaceAllUsesWith(ExtractElement); Inst->eraseFromParent(); break; } case Instruction::Store: { Value *Ptr = Inst->getOperand(1); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); Value *VecValue = Builder.CreateLoad(BitCast); Value *NewVecValue = Builder.CreateInsertElement(VecValue, Inst->getOperand(0), Index); Builder.CreateStore(NewVecValue, BitCast); Inst->eraseFromParent(); break; } case Instruction::BitCast: case Instruction::AddrSpaceCast: break; default: Inst->dump(); llvm_unreachable("Inconsistency in instructions promotable to vector"); } } return true; } static void collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { for (User *User : Val->users()) { if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) continue; if (isa(User)) { WorkList.push_back(User); continue; } if (!User->getType()->isPointerTy()) continue; WorkList.push_back(User); collectUsesWithPtrTypes(User, WorkList); } } void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { IRBuilder<> Builder(&I); // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I)) return; DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); // FIXME: This is the maximum work group size. We should try to get // value from the reqd_work_group_size function attribute if it is // available. unsigned WorkGroupSize = 256; int AllocaSize = WorkGroupSize * Mod->getDataLayout()->getTypeAllocSize(AllocaTy); if (AllocaSize > LocalMemAvailable) { DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); return; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); LocalMemAvailable -= AllocaSize; GlobalVariable *GV = new GlobalVariable( *Mod, ArrayType::get(I.getAllocatedType(), 256), false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); FunctionType *FTy = FunctionType::get( Type::getInt32Ty(Mod->getContext()), false); AttributeSet AttrSet; AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); Value *ReadLocalSizeY = Mod->getOrInsertFunction( "llvm.r600.read.local.size.y", FTy, AttrSet); Value *ReadLocalSizeZ = Mod->getOrInsertFunction( "llvm.r600.read.local.size.z", FTy, AttrSet); Value *ReadTIDIGX = Mod->getOrInsertFunction( "llvm.r600.read.tidig.x", FTy, AttrSet); Value *ReadTIDIGY = Mod->getOrInsertFunction( "llvm.r600.read.tidig.y", FTy, AttrSet); Value *ReadTIDIGZ = Mod->getOrInsertFunction( "llvm.r600.read.tidig.z", FTy, AttrSet); Value *TCntY = Builder.CreateCall(ReadLocalSizeY); Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ); Value *TIdX = Builder.CreateCall(ReadTIDIGX); Value *TIdY = Builder.CreateCall(ReadTIDIGY); Value *TIdZ = Builder.CreateCall(ReadTIDIGZ); Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); Tmp0 = Builder.CreateMul(Tmp0, TIdX); Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); std::vector Indices; Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); Indices.push_back(TID); Value *Offset = Builder.CreateGEP(GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); std::vector WorkList; collectUsesWithPtrTypes(Offset, WorkList); for (std::vector::iterator i = WorkList.begin(), e = WorkList.end(); i != e; ++i) { Value *V = *i; CallInst *Call = dyn_cast(V); if (!Call) { Type *EltTy = V->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); // The operand's value should be corrected on its own. if (isa(V)) continue; // FIXME: It doesn't really make sense to try to do this for all // instructions. V->mutateType(NewTy); continue; } IntrinsicInst *Intr = dyn_cast(Call); if (!Intr) { std::vector ArgTypes; for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); ArgIdx != ArgEnd; ++ArgIdx) { ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); } Function *F = Call->getCalledFunction(); FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, F->isVarArg()); Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType, F->getAttributes()); Function *NewF = cast(C); Call->setCalledFunction(NewF); continue; } Builder.SetInsertPoint(Intr); switch (Intr->getIntrinsicID()) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // These intrinsics are for address space 0 only Intr->eraseFromParent(); continue; case Intrinsic::memcpy: { MemCpyInst *MemCpy = cast(Intr); Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), MemCpy->getLength(), MemCpy->getAlignment(), MemCpy->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memset: { MemSetInst *MemSet = cast(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(), MemSet->getAlignment(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } default: Intr->dump(); llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } } FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { return new AMDGPUPromoteAlloca(ST); }