diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index ca1e0b65289..a4feec71312 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -18,7 +18,9 @@
 #define DEBUG_TYPE "AMDGPUtti"
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
@@ -73,6 +75,8 @@ public:
 
   virtual bool hasBranchDivergence() const;
 
+  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
+
   /// @}
 };
 
@@ -88,3 +92,37 @@ llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
 }
 
 bool AMDGPUTTI::hasBranchDivergence() const { return true; }
+
+/// Adjust the loop-unrolling preferences \p UP for loop \p L.
+///
+/// Walks the instructions of each block in \p L looking for a getelementptr
+/// whose underlying object is an alloca.  On the first match UP.Threshold is
+/// set to UINT_MAX and the scan stops; with no match \p UP is left unchanged.
+void AMDGPUTTI::getUnrollingPreferences(Loop *L,
+                                        UnrollingPreferences &UP) const {
+  for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+                                                  BI != BE; ++BI) {
+    BasicBlock *BB = *BI;
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+                                                      I != E; ++I) {
+      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I);
+      if (!GEP)
+        continue;
+      const Value *Ptr = GEP->getPointerOperand();
+      const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr));
+      if (Alloca) {
+        // We want to do whatever we can to limit the number of alloca
+        // instructions that make it through to the code generator. allocas
+        // require us to use indirect addressing, which is slow and prone to
+        // compiler bugs. If this loop does an address calculation on an
+        // alloca ptr, then we want to unconditionally unroll the loop. In
+        // most cases, this will make it possible for SROA to eliminate these
+        // allocas.
+        UP.Threshold = UINT_MAX;
+        // The threshold is already at its maximum, so inspecting the
+        // remaining instructions cannot change the outcome.
+        return;
+      }
+    }
+  }
+}
diff --git a/test/CodeGen/R600/unroll.ll b/test/CodeGen/R600/unroll.ll
new file mode 100644
index 00000000000..e0035eae71c
--- /dev/null
+++ b/test/CodeGen/R600/unroll.ll
@@ -0,0 +1,37 @@
+; RUN: opt -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; This test contains a simple loop that initializes an array declared in
+; private memory. We want to make sure these kinds of loops are always
+; unrolled, because private memory is slow.
+
+; CHECK-LABEL: @test
+; CHECK-NOT: alloca
+; CHECK: store i32 5, i32 addrspace(1)* %out
+define void @test(i32 addrspace(1)* %out) {
+entry:
+  %0 = alloca [32 x i32]
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr = getelementptr [32 x i32]* %0, i32 0, i32 %counter
+  store i32 %counter, i32* %ptr
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %1 = icmp sge i32 %counter, 32
+  br i1 %1, label %exit, label %loop.header
+
+exit:
+  %2 = getelementptr [32 x i32]* %0, i32 0, i32 5
+  %3 = load i32* %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}