Teach LSR how to cope better with unrolled loops on targets where

the addressing modes don't make this trivially easy. This allows
it to avoid falling into the less precise heuristics in more
cases.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@104186 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Dan Gohman 2010-05-19 23:43:12 +00:00
parent 492fd454ca
commit a2086b3483
2 changed files with 577 additions and 3 deletions

View File

@ -113,6 +113,7 @@ class RegUseTracker {
public: public:
void CountRegister(const SCEV *Reg, size_t LUIdx); void CountRegister(const SCEV *Reg, size_t LUIdx);
void DropRegister(const SCEV *Reg, size_t LUIdx); void DropRegister(const SCEV *Reg, size_t LUIdx);
void DropUse(size_t LUIdx);
bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
@ -150,6 +151,14 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
RSD.UsedByIndices.reset(LUIdx); RSD.UsedByIndices.reset(LUIdx);
} }
void
RegUseTracker::DropUse(size_t LUIdx) {
// Remove the use index from every register's use list.
for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
I != E; ++I)
I->second.UsedByIndices.reset(LUIdx);
}
bool bool
RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const { RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
if (!RegUsesMap.count(Reg)) return false; if (!RegUsesMap.count(Reg)) return false;
@ -951,6 +960,7 @@ public:
MaxOffset(INT64_MIN), MaxOffset(INT64_MIN),
AllFixupsOutsideLoop(true) {} AllFixupsOutsideLoop(true) {}
bool HasFormulaWithSameRegs(const Formula &F) const;
bool InsertFormula(const Formula &F); bool InsertFormula(const Formula &F);
void DeleteFormula(Formula &F); void DeleteFormula(Formula &F);
void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses); void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
@ -961,6 +971,16 @@ public:
void dump() const; void dump() const;
}; };
/// HasFormula - Test whether this use as a formula which has the same
/// registers as the given formula.
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
SmallVector<const SCEV *, 2> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
std::sort(Key.begin(), Key.end());
return Uniquifier.count(Key);
}
/// InsertFormula - If the given formula has not yet been inserted, add it to /// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise. /// the list, and return true. Return false otherwise.
bool LSRUse::InsertFormula(const Formula &F) { bool LSRUse::InsertFormula(const Formula &F) {
@ -995,6 +1015,7 @@ bool LSRUse::InsertFormula(const Formula &F) {
void LSRUse::DeleteFormula(Formula &F) { void LSRUse::DeleteFormula(Formula &F) {
std::swap(F, Formulae.back()); std::swap(F, Formulae.back());
Formulae.pop_back(); Formulae.pop_back();
assert(!Formulae.empty() && "LSRUse has no formulae left!");
} }
/// RecomputeRegs - Recompute the Regs field, and update RegUses. /// RecomputeRegs - Recompute the Regs field, and update RegUses.
@ -1134,6 +1155,13 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
AM.HasBaseReg = HasBaseReg; AM.HasBaseReg = HasBaseReg;
AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
// Canonicalize a scale of 1 to a base register if the formula doesn't
// already have a base register.
if (!AM.HasBaseReg && AM.Scale == 1) {
AM.Scale = 0;
AM.HasBaseReg = true;
}
return isLegalUse(AM, Kind, AccessTy, TLI); return isLegalUse(AM, Kind, AccessTy, TLI);
} }
@ -1244,12 +1272,15 @@ class LSRInstance {
UseMapTy UseMap; UseMapTy UseMap;
bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
bool HasBaseReg,
LSRUse::KindType Kind, const Type *AccessTy); LSRUse::KindType Kind, const Type *AccessTy);
std::pair<size_t, int64_t> getUse(const SCEV *&Expr, std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
LSRUse::KindType Kind, LSRUse::KindType Kind,
const Type *AccessTy); const Type *AccessTy);
LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
public: public:
void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
@ -1742,6 +1773,7 @@ LSRInstance::OptimizeLoopTermCond() {
bool bool
LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
bool HasBaseReg,
LSRUse::KindType Kind, const Type *AccessTy) { LSRUse::KindType Kind, const Type *AccessTy) {
int64_t NewMinOffset = LU.MinOffset; int64_t NewMinOffset = LU.MinOffset;
int64_t NewMaxOffset = LU.MaxOffset; int64_t NewMaxOffset = LU.MaxOffset;
@ -1754,12 +1786,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
return false; return false;
// Conservatively assume HasBaseReg is true for now. // Conservatively assume HasBaseReg is true for now.
if (NewOffset < LU.MinOffset) { if (NewOffset < LU.MinOffset) {
if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, /*HasBaseReg=*/true, if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
Kind, AccessTy, TLI)) Kind, AccessTy, TLI))
return false; return false;
NewMinOffset = NewOffset; NewMinOffset = NewOffset;
} else if (NewOffset > LU.MaxOffset) { } else if (NewOffset > LU.MaxOffset) {
if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, /*HasBaseReg=*/true, if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
Kind, AccessTy, TLI)) Kind, AccessTy, TLI))
return false; return false;
NewMaxOffset = NewOffset; NewMaxOffset = NewOffset;
@ -1798,7 +1830,7 @@ LSRInstance::getUse(const SCEV *&Expr,
// A use already existed with this base. // A use already existed with this base.
size_t LUIdx = P.first->second; size_t LUIdx = P.first->second;
LSRUse &LU = Uses[LUIdx]; LSRUse &LU = Uses[LUIdx];
if (reconcileNewOffset(LU, Offset, Kind, AccessTy)) if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
// Reuse this use. // Reuse this use.
return std::make_pair(LUIdx, Offset); return std::make_pair(LUIdx, Offset);
} }
@ -1819,6 +1851,40 @@ LSRInstance::getUse(const SCEV *&Expr,
return std::make_pair(LUIdx, Offset); return std::make_pair(LUIdx, Offset);
} }
/// FindUseWithFormula - Look for a use distinct from OrigLU which is has
/// a formula that has the same registers as the given formula.
LSRUse *
LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
const LSRUse &OrigLU) {
// Search all uses for the formula. This could be more clever. Ignore
// ICmpZero uses because they may contain formulae generated by
// GenerateICmpZeroScales, in which case adding fixup offsets may
// be invalid.
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
if (&LU != &OrigLU &&
LU.Kind != LSRUse::ICmpZero &&
LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
LU.HasFormulaWithSameRegs(OrigF)) {
for (size_t FIdx = 0, NumForms = LU.Formulae.size();
FIdx != NumForms; ++FIdx) {
Formula &F = LU.Formulae[FIdx];
if (F.BaseRegs == OrigF.BaseRegs &&
F.ScaledReg == OrigF.ScaledReg &&
F.AM.BaseGV == OrigF.AM.BaseGV &&
F.AM.Scale == OrigF.AM.Scale &&
LU.Kind) {
if (F.AM.BaseOffs == 0)
return &LU;
break;
}
}
}
}
return 0;
}
void LSRInstance::CollectInterestingTypesAndFactors() { void LSRInstance::CollectInterestingTypesAndFactors() {
SmallSetVector<const SCEV *, 4> Strides; SmallSetVector<const SCEV *, 4> Strides;
@ -2722,6 +2788,128 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {
/// of formulae. This keeps the main solver from taking an extraordinary amount /// of formulae. This keeps the main solver from taking an extraordinary amount
/// of time in some worst-case scenarios. /// of time in some worst-case scenarios.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() { void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
DEBUG(dbgs() << "The search space is too complex.\n");
DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
"which use a superset of registers used by other "
"formulae.\n");
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
bool Any = false;
for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
Formula &F = LU.Formulae[i];
for (SmallVectorImpl<const SCEV *>::const_iterator
I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
Formula NewF = F;
NewF.AM.BaseOffs += C->getValue()->getSExtValue();
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
LU.DeleteFormula(F);
--i;
--e;
Any = true;
break;
}
} else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
if (!F.AM.BaseGV) {
Formula NewF = F;
NewF.AM.BaseGV = GV;
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
DEBUG(dbgs() << " Deleting "; F.print(dbgs());
dbgs() << '\n');
LU.DeleteFormula(F);
--i;
--e;
Any = true;
break;
}
}
}
}
}
if (Any)
LU.RecomputeRegs(LUIdx, RegUses);
}
DEBUG(dbgs() << "After pre-selection:\n";
print_uses(dbgs()));
}
if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
DEBUG(dbgs() << "The search space is too complex.\n");
DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
"separated by a constant offset will use the same "
"registers.\n");
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
Formula &F = LU.Formulae[i];
if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
/*HasBaseReg=*/false,
LU.Kind, LU.AccessTy)) {
DEBUG(dbgs() << " Deleting use "; LU.print(dbgs());
dbgs() << '\n');
LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
// Delete formulae from the new use which are no longer legal.
bool Any = false;
for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
Formula &F = LUThatHas->Formulae[i];
if (!isLegalUse(F.AM,
LUThatHas->MinOffset, LUThatHas->MaxOffset,
LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
DEBUG(dbgs() << " Deleting "; F.print(dbgs());
dbgs() << '\n');
LUThatHas->DeleteFormula(F);
--i;
--e;
Any = true;
}
}
if (Any)
LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
// Update the relocs to reference the new use.
for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
if (Fixups[i].LUIdx == LUIdx) {
Fixups[i].LUIdx = LUThatHas - &Uses.front();
Fixups[i].Offset += F.AM.BaseOffs;
DEBUG(errs() << "New fixup has offset "
<< Fixups[i].Offset << "\n");
}
if (Fixups[i].LUIdx == NumUses-1)
Fixups[i].LUIdx = LUIdx;
}
// Delete the old use.
std::swap(LU, Uses.back());
Uses.pop_back();
--LUIdx;
--NumUses;
break;
}
}
}
}
}
DEBUG(dbgs() << "After pre-selection:\n";
print_uses(dbgs()));
}
SmallPtrSet<const SCEV *, 4> Taken; SmallPtrSet<const SCEV *, 4> Taken;
while (EstimateSearchSpaceComplexity() >= ComplexityLimit) { while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
// Ok, we have too many of formulae on our hands to conveniently handle. // Ok, we have too many of formulae on our hands to conveniently handle.

View File

@ -0,0 +1,386 @@
; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s
; LSR should recognize that this is an unrolled loop which can use
; constant offset addressing, so that each of the following stores
; uses the same register.
; CHECK: vstr.32 s0, [r12, #-128]
; CHECK: vstr.32 s0, [r12, #-96]
; CHECK: vstr.32 s0, [r12, #-64]
; CHECK: vstr.32 s0, [r12, #-32]
; CHECK: vstr.32 s0, [r12]
; CHECK: vstr.32 s0, [r12, #32]
; CHECK: vstr.32 s0, [r12, #64]
; CHECK: vstr.32 s0, [r12, #96]
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
%0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* }
%1 = type { void (%2*)*, void (%2*, i32)*, void (%2*)*, void (%2*, i8*)*, void (%2*)*, i32, %7, i32, i32, i8**, i32, i8**, i32, i32 }
%2 = type { %1*, %3*, %6*, i8*, i32, i32 }
%3 = type { i8* (%2*, i32, i32)*, i8* (%2*, i32, i32)*, i8** (%2*, i32, i32, i32)*, [64 x i16]** (%2*, i32, i32, i32)*, %4* (%2*, i32, i32, i32, i32, i32)*, %5* (%2*, i32, i32, i32, i32, i32)*, void (%2*)*, i8** (%2*, %4*, i32, i32, i32)*, [64 x i16]** (%2*, %5*, i32, i32, i32)*, void (%2*, i32)*, void (%2*)*, i32, i32 }
%4 = type opaque
%5 = type opaque
%6 = type { void (%2*)*, i32, i32, i32, i32 }
%7 = type { [8 x i32], [12 x i32] }
%8 = type { i8*, i32, void (%0*)*, i32 (%0*)*, void (%0*, i32)*, i32 (%0*, i32)*, void (%0*)* }
%9 = type { [64 x i16], i32 }
%10 = type { [17 x i8], [256 x i8], i32 }
%11 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %9*, i8* }
%12 = type { %12*, i8, i32, i32, i8* }
%13 = type { void (%0*)*, void (%0*)*, i32 }
%14 = type { void (%0*, i32)*, void (%0*, i8**, i32*, i32)* }
%15 = type { void (%0*)*, i32 (%0*)*, void (%0*)*, i32 (%0*, i8***)*, %5** }
%16 = type { void (%0*, i32)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)* }
%17 = type { i32 (%0*)*, void (%0*)*, void (%0*)*, void (%0*)*, i32, i32 }
%18 = type { void (%0*)*, i32 (%0*)*, i32 (%0*)*, i32, i32, i32, i32 }
%19 = type { void (%0*)*, i32 (%0*, [64 x i16]**)*, i32 }
%20 = type { void (%0*)*, [10 x void (%0*, %11*, i16*, i8**, i32)*] }
%21 = type { void (%0*)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)*, i32 }
%22 = type { void (%0*)*, void (%0*, i8***, i32, i8**, i32)* }
%23 = type { void (%0*, i32)*, void (%0*, i8**, i8**, i32)*, void (%0*)*, void (%0*)* }
define arm_apcscc void @test(%0* nocapture %a0, %11* nocapture %a1, i16* nocapture %a2, i8** nocapture %a3, i32 %a4) nounwind {
bb:
%t = alloca [64 x float], align 4
%t5 = getelementptr inbounds %0* %a0, i32 0, i32 65
%t6 = load i8** %t5, align 4
%t7 = getelementptr inbounds %11* %a1, i32 0, i32 20
%t8 = load i8** %t7, align 4
br label %bb9
bb9:
%t10 = phi i32 [ 0, %bb ], [ %t157, %bb156 ]
%t11 = add i32 %t10, 8
%t12 = getelementptr [64 x float]* %t, i32 0, i32 %t11
%t13 = add i32 %t10, 16
%t14 = getelementptr [64 x float]* %t, i32 0, i32 %t13
%t15 = add i32 %t10, 24
%t16 = getelementptr [64 x float]* %t, i32 0, i32 %t15
%t17 = add i32 %t10, 32
%t18 = getelementptr [64 x float]* %t, i32 0, i32 %t17
%t19 = add i32 %t10, 40
%t20 = getelementptr [64 x float]* %t, i32 0, i32 %t19
%t21 = add i32 %t10, 48
%t22 = getelementptr [64 x float]* %t, i32 0, i32 %t21
%t23 = add i32 %t10, 56
%t24 = getelementptr [64 x float]* %t, i32 0, i32 %t23
%t25 = getelementptr [64 x float]* %t, i32 0, i32 %t10
%t26 = shl i32 %t10, 5
%t27 = or i32 %t26, 8
%t28 = getelementptr i8* %t8, i32 %t27
%t29 = bitcast i8* %t28 to float*
%t30 = or i32 %t26, 16
%t31 = getelementptr i8* %t8, i32 %t30
%t32 = bitcast i8* %t31 to float*
%t33 = or i32 %t26, 24
%t34 = getelementptr i8* %t8, i32 %t33
%t35 = bitcast i8* %t34 to float*
%t36 = or i32 %t26, 4
%t37 = getelementptr i8* %t8, i32 %t36
%t38 = bitcast i8* %t37 to float*
%t39 = or i32 %t26, 12
%t40 = getelementptr i8* %t8, i32 %t39
%t41 = bitcast i8* %t40 to float*
%t42 = or i32 %t26, 20
%t43 = getelementptr i8* %t8, i32 %t42
%t44 = bitcast i8* %t43 to float*
%t45 = or i32 %t26, 28
%t46 = getelementptr i8* %t8, i32 %t45
%t47 = bitcast i8* %t46 to float*
%t48 = getelementptr i8* %t8, i32 %t26
%t49 = bitcast i8* %t48 to float*
%t50 = shl i32 %t10, 3
%t51 = or i32 %t50, 1
%t52 = getelementptr i16* %a2, i32 %t51
%t53 = or i32 %t50, 2
%t54 = getelementptr i16* %a2, i32 %t53
%t55 = or i32 %t50, 3
%t56 = getelementptr i16* %a2, i32 %t55
%t57 = or i32 %t50, 4
%t58 = getelementptr i16* %a2, i32 %t57
%t59 = or i32 %t50, 5
%t60 = getelementptr i16* %a2, i32 %t59
%t61 = or i32 %t50, 6
%t62 = getelementptr i16* %a2, i32 %t61
%t63 = or i32 %t50, 7
%t64 = getelementptr i16* %a2, i32 %t63
%t65 = getelementptr i16* %a2, i32 %t50
%t66 = load i16* %t52, align 2
%t67 = icmp eq i16 %t66, 0
%t68 = load i16* %t54, align 2
%t69 = icmp eq i16 %t68, 0
%t70 = and i1 %t67, %t69
br i1 %t70, label %bb71, label %bb91
bb71:
%t72 = load i16* %t56, align 2
%t73 = icmp eq i16 %t72, 0
br i1 %t73, label %bb74, label %bb91
bb74:
%t75 = load i16* %t58, align 2
%t76 = icmp eq i16 %t75, 0
br i1 %t76, label %bb77, label %bb91
bb77:
%t78 = load i16* %t60, align 2
%t79 = icmp eq i16 %t78, 0
br i1 %t79, label %bb80, label %bb91
bb80:
%t81 = load i16* %t62, align 2
%t82 = icmp eq i16 %t81, 0
br i1 %t82, label %bb83, label %bb91
bb83:
%t84 = load i16* %t64, align 2
%t85 = icmp eq i16 %t84, 0
br i1 %t85, label %bb86, label %bb91
bb86:
%t87 = load i16* %t65, align 2
%t88 = sitofp i16 %t87 to float
%t89 = load float* %t49, align 4
%t90 = fmul float %t88, %t89
store float %t90, float* %t25, align 4
store float %t90, float* %t12, align 4
store float %t90, float* %t14, align 4
store float %t90, float* %t16, align 4
store float %t90, float* %t18, align 4
store float %t90, float* %t20, align 4
store float %t90, float* %t22, align 4
store float %t90, float* %t24, align 4
br label %bb156
bb91:
%t92 = load i16* %t65, align 2
%t93 = sitofp i16 %t92 to float
%t94 = load float* %t49, align 4
%t95 = fmul float %t93, %t94
%t96 = sitofp i16 %t68 to float
%t97 = load float* %t29, align 4
%t98 = fmul float %t96, %t97
%t99 = load i16* %t58, align 2
%t100 = sitofp i16 %t99 to float
%t101 = load float* %t32, align 4
%t102 = fmul float %t100, %t101
%t103 = load i16* %t62, align 2
%t104 = sitofp i16 %t103 to float
%t105 = load float* %t35, align 4
%t106 = fmul float %t104, %t105
%t107 = fadd float %t95, %t102
%t108 = fsub float %t95, %t102
%t109 = fadd float %t98, %t106
%t110 = fsub float %t98, %t106
%t111 = fmul float %t110, 0x3FF6A09E60000000
%t112 = fsub float %t111, %t109
%t113 = fadd float %t107, %t109
%t114 = fsub float %t107, %t109
%t115 = fadd float %t108, %t112
%t116 = fsub float %t108, %t112
%t117 = sitofp i16 %t66 to float
%t118 = load float* %t38, align 4
%t119 = fmul float %t117, %t118
%t120 = load i16* %t56, align 2
%t121 = sitofp i16 %t120 to float
%t122 = load float* %t41, align 4
%t123 = fmul float %t121, %t122
%t124 = load i16* %t60, align 2
%t125 = sitofp i16 %t124 to float
%t126 = load float* %t44, align 4
%t127 = fmul float %t125, %t126
%t128 = load i16* %t64, align 2
%t129 = sitofp i16 %t128 to float
%t130 = load float* %t47, align 4
%t131 = fmul float %t129, %t130
%t132 = fadd float %t127, %t123
%t133 = fsub float %t127, %t123
%t134 = fadd float %t119, %t131
%t135 = fsub float %t119, %t131
%t136 = fadd float %t134, %t132
%t137 = fsub float %t134, %t132
%t138 = fmul float %t137, 0x3FF6A09E60000000
%t139 = fadd float %t133, %t135
%t140 = fmul float %t139, 0x3FFD906BC0000000
%t141 = fmul float %t135, 0x3FF1517A80000000
%t142 = fsub float %t141, %t140
%t143 = fmul float %t133, 0xC004E7AEA0000000
%t144 = fadd float %t143, %t140
%t145 = fsub float %t144, %t136
%t146 = fsub float %t138, %t145
%t147 = fadd float %t142, %t146
%t148 = fadd float %t113, %t136
store float %t148, float* %t25, align 4
%t149 = fsub float %t113, %t136
store float %t149, float* %t24, align 4
%t150 = fadd float %t115, %t145
store float %t150, float* %t12, align 4
%t151 = fsub float %t115, %t145
store float %t151, float* %t22, align 4
%t152 = fadd float %t116, %t146
store float %t152, float* %t14, align 4
%t153 = fsub float %t116, %t146
store float %t153, float* %t20, align 4
%t154 = fadd float %t114, %t147
store float %t154, float* %t18, align 4
%t155 = fsub float %t114, %t147
store float %t155, float* %t16, align 4
br label %bb156
bb156:
%t157 = add i32 %t10, 1
%t158 = icmp eq i32 %t157, 8
br i1 %t158, label %bb159, label %bb9
bb159:
%t160 = add i32 %a4, 7
%t161 = add i32 %a4, 1
%t162 = add i32 %a4, 6
%t163 = add i32 %a4, 2
%t164 = add i32 %a4, 5
%t165 = add i32 %a4, 4
%t166 = add i32 %a4, 3
br label %bb167
bb167:
%t168 = phi i32 [ 0, %bb159 ], [ %t293, %bb167 ]
%t169 = getelementptr i8** %a3, i32 %t168
%t170 = shl i32 %t168, 3
%t171 = or i32 %t170, 4
%t172 = getelementptr [64 x float]* %t, i32 0, i32 %t171
%t173 = or i32 %t170, 2
%t174 = getelementptr [64 x float]* %t, i32 0, i32 %t173
%t175 = or i32 %t170, 6
%t176 = getelementptr [64 x float]* %t, i32 0, i32 %t175
%t177 = or i32 %t170, 5
%t178 = getelementptr [64 x float]* %t, i32 0, i32 %t177
%t179 = or i32 %t170, 3
%t180 = getelementptr [64 x float]* %t, i32 0, i32 %t179
%t181 = or i32 %t170, 1
%t182 = getelementptr [64 x float]* %t, i32 0, i32 %t181
%t183 = or i32 %t170, 7
%t184 = getelementptr [64 x float]* %t, i32 0, i32 %t183
%t185 = getelementptr [64 x float]* %t, i32 0, i32 %t170
%t186 = load i8** %t169, align 4
%t187 = getelementptr inbounds i8* %t186, i32 %a4
%t188 = load float* %t185, align 4
%t189 = load float* %t172, align 4
%t190 = fadd float %t188, %t189
%t191 = fsub float %t188, %t189
%t192 = load float* %t174, align 4
%t193 = load float* %t176, align 4
%t194 = fadd float %t192, %t193
%t195 = fsub float %t192, %t193
%t196 = fmul float %t195, 0x3FF6A09E60000000
%t197 = fsub float %t196, %t194
%t198 = fadd float %t190, %t194
%t199 = fsub float %t190, %t194
%t200 = fadd float %t191, %t197
%t201 = fsub float %t191, %t197
%t202 = load float* %t178, align 4
%t203 = load float* %t180, align 4
%t204 = fadd float %t202, %t203
%t205 = fsub float %t202, %t203
%t206 = load float* %t182, align 4
%t207 = load float* %t184, align 4
%t208 = fadd float %t206, %t207
%t209 = fsub float %t206, %t207
%t210 = fadd float %t208, %t204
%t211 = fsub float %t208, %t204
%t212 = fmul float %t211, 0x3FF6A09E60000000
%t213 = fadd float %t205, %t209
%t214 = fmul float %t213, 0x3FFD906BC0000000
%t215 = fmul float %t209, 0x3FF1517A80000000
%t216 = fsub float %t215, %t214
%t217 = fmul float %t205, 0xC004E7AEA0000000
%t218 = fadd float %t217, %t214
%t219 = fsub float %t218, %t210
%t220 = fsub float %t212, %t219
%t221 = fadd float %t216, %t220
%t222 = fadd float %t198, %t210
%t223 = fptosi float %t222 to i32
%t224 = add nsw i32 %t223, 4
%t225 = lshr i32 %t224, 3
%t226 = and i32 %t225, 1023
%t227 = add i32 %t226, 128
%t228 = getelementptr inbounds i8* %t6, i32 %t227
%t229 = load i8* %t228, align 1
store i8 %t229, i8* %t187, align 1
%t230 = fsub float %t198, %t210
%t231 = fptosi float %t230 to i32
%t232 = add nsw i32 %t231, 4
%t233 = lshr i32 %t232, 3
%t234 = and i32 %t233, 1023
%t235 = add i32 %t234, 128
%t236 = getelementptr inbounds i8* %t6, i32 %t235
%t237 = load i8* %t236, align 1
%t238 = getelementptr inbounds i8* %t186, i32 %t160
store i8 %t237, i8* %t238, align 1
%t239 = fadd float %t200, %t219
%t240 = fptosi float %t239 to i32
%t241 = add nsw i32 %t240, 4
%t242 = lshr i32 %t241, 3
%t243 = and i32 %t242, 1023
%t244 = add i32 %t243, 128
%t245 = getelementptr inbounds i8* %t6, i32 %t244
%t246 = load i8* %t245, align 1
%t247 = getelementptr inbounds i8* %t186, i32 %t161
store i8 %t246, i8* %t247, align 1
%t248 = fsub float %t200, %t219
%t249 = fptosi float %t248 to i32
%t250 = add nsw i32 %t249, 4
%t251 = lshr i32 %t250, 3
%t252 = and i32 %t251, 1023
%t253 = add i32 %t252, 128
%t254 = getelementptr inbounds i8* %t6, i32 %t253
%t255 = load i8* %t254, align 1
%t256 = getelementptr inbounds i8* %t186, i32 %t162
store i8 %t255, i8* %t256, align 1
%t257 = fadd float %t201, %t220
%t258 = fptosi float %t257 to i32
%t259 = add nsw i32 %t258, 4
%t260 = lshr i32 %t259, 3
%t261 = and i32 %t260, 1023
%t262 = add i32 %t261, 128
%t263 = getelementptr inbounds i8* %t6, i32 %t262
%t264 = load i8* %t263, align 1
%t265 = getelementptr inbounds i8* %t186, i32 %t163
store i8 %t264, i8* %t265, align 1
%t266 = fsub float %t201, %t220
%t267 = fptosi float %t266 to i32
%t268 = add nsw i32 %t267, 4
%t269 = lshr i32 %t268, 3
%t270 = and i32 %t269, 1023
%t271 = add i32 %t270, 128
%t272 = getelementptr inbounds i8* %t6, i32 %t271
%t273 = load i8* %t272, align 1
%t274 = getelementptr inbounds i8* %t186, i32 %t164
store i8 %t273, i8* %t274, align 1
%t275 = fadd float %t199, %t221
%t276 = fptosi float %t275 to i32
%t277 = add nsw i32 %t276, 4
%t278 = lshr i32 %t277, 3
%t279 = and i32 %t278, 1023
%t280 = add i32 %t279, 128
%t281 = getelementptr inbounds i8* %t6, i32 %t280
%t282 = load i8* %t281, align 1
%t283 = getelementptr inbounds i8* %t186, i32 %t165
store i8 %t282, i8* %t283, align 1
%t284 = fsub float %t199, %t221
%t285 = fptosi float %t284 to i32
%t286 = add nsw i32 %t285, 4
%t287 = lshr i32 %t286, 3
%t288 = and i32 %t287, 1023
%t289 = add i32 %t288, 128
%t290 = getelementptr inbounds i8* %t6, i32 %t289
%t291 = load i8* %t290, align 1
%t292 = getelementptr inbounds i8* %t186, i32 %t166
store i8 %t291, i8* %t292, align 1
%t293 = add nsw i32 %t168, 1
%t294 = icmp eq i32 %t293, 8
br i1 %t294, label %bb295, label %bb167
bb295:
ret void
}