From 139bfee84c61fe5c20c80363a560e925ace68064 Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Fri, 9 Jan 2015 15:51:16 +0000 Subject: [PATCH] [PowerPC] Enable late partial unrolling on the POWER7 The P7 benefits from not having really small loops, so that we have multiple dispatch groups in the loop and/or the ability to form fuller dispatch groups during scheduling. Setting the partial unrolling threshold to 44 seems good, empirically, for the P7. Compared to using no late partial unrolling, this yields the following test-suite speedups: SingleSource/Benchmarks/Adobe-C++/simple_types_constant_folding -66.3253% +/- 24.1975% SingleSource/Benchmarks/Misc-C++/oopack_v1p8 -44.0169% +/- 29.4881% SingleSource/Benchmarks/Misc/pi -27.8351% +/- 12.2712% SingleSource/Benchmarks/Stanford/Bubblesort -30.9898% +/- 22.4647% I've speculatively added a similar setting for the P8. Also, I've noticed that the unroller does not quite calculate the unrolling factor correctly for really tiny loops because it neglects to account for the fact that not every loop body replicant contains an ending branch and counter increment. I'll fix that later. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225522 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/PPCScheduleP7.td | 3 ++ lib/Target/PowerPC/PPCScheduleP8.td | 3 ++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 2 + .../LoopUnroll/PowerPC/p7-unrolling.ll | 51 +++++++++++++++++++ 4 files changed, 59 insertions(+) create mode 100644 test/Transforms/LoopUnroll/PowerPC/p7-unrolling.ll diff --git a/lib/Target/PowerPC/PPCScheduleP7.td b/lib/Target/PowerPC/PPCScheduleP7.td index d3e426975ec..00aa834ca4b 100644 --- a/lib/Target/PowerPC/PPCScheduleP7.td +++ b/lib/Target/PowerPC/PPCScheduleP7.td @@ -380,6 +380,9 @@ def P7Model : SchedMachineModel { // Itineraries are queried instead. let MispredictPenalty = 16; + // Try to make sure we have at least 10-11 dispatch groups in a loop. 
+ let LoopMicroOpBufferSize = 44; + let Itineraries = P7Itineraries; } diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td index 468dd937f8b..8cdc79e9d14 100644 --- a/lib/Target/PowerPC/PPCScheduleP8.td +++ b/lib/Target/PowerPC/PPCScheduleP8.td @@ -389,6 +389,9 @@ def P8Model : SchedMachineModel { // Itineraries are queried instead. let MispredictPenalty = 16; + // Try to make sure we have at least 10-11 dispatch groups in a loop. + let LoopMicroOpBufferSize = 66; + let Itineraries = P8Itineraries; } diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 37624ed93d3..fc440a56fdb 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -277,6 +277,8 @@ void PPCTTI::getUnrollingPreferences(const Function *F, Loop *L, // helps expose latency-hiding opportunities to the instruction scheduler. UP.Partial = UP.Runtime = true; } + + TargetTransformInfo::getUnrollingPreferences(F, L, UP); } unsigned PPCTTI::getNumberOfRegisters(bool Vector) const { diff --git a/test/Transforms/LoopUnroll/PowerPC/p7-unrolling.ll b/test/Transforms/LoopUnroll/PowerPC/p7-unrolling.ll new file mode 100644 index 00000000000..b2ea74a75da --- /dev/null +++ b/test/Transforms/LoopUnroll/PowerPC/p7-unrolling.ll @@ -0,0 +1,51 @@ +; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -loop-unroll | FileCheck %s +define void @unroll_opt_for_size() nounwind optsize { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + +; CHECK-LABEL: @unroll_opt_for_size +; CHECK: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK: icmp + +define void @unroll_default() nounwind { +entry: + br label %loop + +loop: 
+ %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + +; CHECK-LABEL: @unroll_default +; CHECK: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK: icmp +