From 40e66277f7daa2f5ce6f4f62b91c1a17e50df98c Mon Sep 17 00:00:00 2001
From: Kevin Qin
Date: Mon, 9 Mar 2015 06:14:28 +0000
Subject: [PATCH] [AArch64] Enable partial & runtime unrolling on cortex-a57

For the inner one of nested loops, it is more likely to be a hot loop, and
the runtime check can be promoted out by the LICM pass, so the overhead is
less; we can try a doubled threshold to unroll more loops.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231632 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 10 +++
 .../LoopUnroll/AArch64/lit.local.cfg          |  3 +
 test/Transforms/LoopUnroll/AArch64/partial.ll | 76 +++++++++++++++++++
 .../LoopUnroll/AArch64/runtime-loop.ll        | 33 ++++++++
 4 files changed, 122 insertions(+)
 create mode 100644 test/Transforms/LoopUnroll/AArch64/lit.local.cfg
 create mode 100644 test/Transforms/LoopUnroll/AArch64/partial.ll
 create mode 100644 test/Transforms/LoopUnroll/AArch64/runtime-loop.ll

diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 39af0472781..0533355b01d 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -10,6 +10,7 @@
 #include "AArch64TargetTransformInfo.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
@@ -426,6 +427,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor() {
 
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
                                              TTI::UnrollingPreferences &UP) {
+  // Enable partial unrolling and runtime unrolling.
+  BaseT::getUnrollingPreferences(L, UP);
+
+  // For inner loop, it is more likely to be a hot one, and the runtime check
+  // can be promoted out from LICM pass, so the overhead is less, let's try
+  // a larger threshold to unroll more loops.
+  if (L->getLoopDepth() > 1)
+    UP.PartialThreshold *= 2;
+
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 }
diff --git a/test/Transforms/LoopUnroll/AArch64/lit.local.cfg b/test/Transforms/LoopUnroll/AArch64/lit.local.cfg
new file mode 100644
index 00000000000..cec29af5bbe
--- /dev/null
+++ b/test/Transforms/LoopUnroll/AArch64/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/LoopUnroll/AArch64/partial.ll b/test/Transforms/LoopUnroll/AArch64/partial.ll
new file mode 100644
index 00000000000..8a1ea80c9d5
--- /dev/null
+++ b/test/Transforms/LoopUnroll/AArch64/partial.ll
@@ -0,0 +1,76 @@
+; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s
+
+; Partial unroll 8 times for this loop.
+define void @unroll1() nounwind {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; CHECK: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: icmp
+
+; Partial unroll 16 times for this loop.
+define void @unroll2() nounwind { +entry: + br label %loop1 + +loop1: + %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ] + br label %loop2.header + +loop2.header: + br label %loop2 + +loop2: + %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ] + %inc2 = add i32 %iv2, 1 + %exitcnd2 = icmp uge i32 %inc2, 1024 + br i1 %exitcnd2, label %exit2, label %loop2 + +exit2: + br label %loop1.latch + +loop1.latch: + %inc1 = add i32 %iv1, 1 + %exitcnd1 = icmp uge i32 %inc1, 1024 + br i1 %exitcnd2, label %exit, label %loop1 + +exit: + ret void +} + + + +; CHECK: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: icmp diff --git a/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll b/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll new file mode 100644 index 00000000000..d3dc081fa6f --- /dev/null +++ b/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll @@ -0,0 +1,33 @@ +; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s + +; Tests for unrolling loops with run-time trip counts + +; CHECK: %xtraiter = and i32 %n +; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0 +; CHECK: br i1 %lcmp.mod, label %for.body.prol, label %for.body.preheader.split + +; CHECK: for.body.prol: +; CHECK: for.body: + +define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + 
%lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} + +