Teach LSR how to cope better with unrolled loops on targets where

the addressing modes don't make this trivially easy. This allows it to avoid falling into the less precise heuristics in more cases. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@104186 91177308-0d34-0410-b5e6-96231b3b80d8
2025-07-13 04:24:40 +00:00 · 2010-05-19 23:43:12 +00:00
parent 492fd454ca
commit a2086b3483
2 changed files with 577 additions and 3 deletions
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@ -113,6 +113,7 @@ class RegUseTracker {
 public:
  void CountRegister(const SCEV *Reg, size_t LUIdx);
  void DropRegister(const SCEV *Reg, size_t LUIdx);
  void DropUse(size_t LUIdx);
  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
@ -150,6 +151,14 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
  RSD.UsedByIndices.reset(LUIdx);
 }
 void
 RegUseTracker::DropUse(size_t LUIdx) {
  // Remove the use index from every register's use list.
  for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
       I != E; ++I)
    I->second.UsedByIndices.reset(LUIdx);
 }
 bool
 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
  if (!RegUsesMap.count(Reg)) return false;
@ -951,6 +960,7 @@ public:
                                      MaxOffset(INT64_MIN),
                                      AllFixupsOutsideLoop(true) {}
  bool HasFormulaWithSameRegs(const Formula &F) const;
  bool InsertFormula(const Formula &F);
  void DeleteFormula(Formula &F);
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
@ -961,6 +971,16 @@ public:
  void dump() const;
 };
 /// HasFormula - Test whether this use as a formula which has the same
 /// registers as the given formula.
 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
  SmallVector<const SCEV *, 2> Key = F.BaseRegs;
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  std::sort(Key.begin(), Key.end());
  return Uniquifier.count(Key);
 }
 /// InsertFormula - If the given formula has not yet been inserted, add it to
 /// the list, and return true. Return false otherwise.
 bool LSRUse::InsertFormula(const Formula &F) {
@ -995,6 +1015,7 @@ bool LSRUse::InsertFormula(const Formula &F) {
 void LSRUse::DeleteFormula(Formula &F) {
  // F is expected to refer to an element of Formulae. Move the last formula
  // into F's slot and then drop the last slot. When F already is the last
  // element there is nothing to move, so skip the swap rather than copying
  // the formula onto itself.
  if (&F != &Formulae.back())
    std::swap(F, Formulae.back());
  Formulae.pop_back();
  // Callers must never delete the final remaining formula of a use.
  assert(!Formulae.empty() && "LSRUse has no formulae left!");
 }
 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
@ -1134,6 +1155,13 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
  // Canonicalize a scale of 1 to a base register if the formula doesn't
  // already have a base register.
  if (!AM.HasBaseReg && AM.Scale == 1) {
    AM.Scale = 0;
    AM.HasBaseReg = true;
  }
  return isLegalUse(AM, Kind, AccessTy, TLI);
 }
@ -1244,12 +1272,15 @@ class LSRInstance {
  UseMapTy UseMap;
  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
                          bool HasBaseReg,
                          LSRUse::KindType Kind, const Type *AccessTy);
  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
                                    LSRUse::KindType Kind,
                                    const Type *AccessTy);
  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
 public:
  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
@ -1742,6 +1773,7 @@ LSRInstance::OptimizeLoopTermCond() {
 bool
 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
                                bool HasBaseReg,
                                LSRUse::KindType Kind, const Type *AccessTy) {
  int64_t NewMinOffset = LU.MinOffset;
  int64_t NewMaxOffset = LU.MaxOffset;
@ -1754,12 +1786,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
    return false;
  // Check foldability using the HasBaseReg value supplied by the caller.
  if (NewOffset < LU.MinOffset) {
-    if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, /*HasBaseReg=*/true,
+    if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
                          Kind, AccessTy, TLI))
      return false;
    NewMinOffset = NewOffset;
  } else if (NewOffset > LU.MaxOffset) {
-    if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, /*HasBaseReg=*/true,
+    if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
                          Kind, AccessTy, TLI))
      return false;
    NewMaxOffset = NewOffset;
@ -1798,7 +1830,7 @@ LSRInstance::getUse(const SCEV *&Expr,
    // A use already existed with this base.
    size_t LUIdx = P.first->second;
    LSRUse &LU = Uses[LUIdx];
-    if (reconcileNewOffset(LU, Offset, Kind, AccessTy))
+    if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
      // Reuse this use.
      return std::make_pair(LUIdx, Offset);
  }
@ -1819,6 +1851,40 @@ LSRInstance::getUse(const SCEV *&Expr,
  return std::make_pair(LUIdx, Offset);
 }
 /// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has
 /// a formula with the same registers, BaseGV and Scale as OrigF and a
 /// BaseOffs of zero. Returns a pointer to that use, or null if no such use
 /// exists.
 LSRUse *
 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
                                        const LSRUse &OrigLU) {
  // Search all uses for the formula. This could be more clever. Ignore
  // ICmpZero uses because they may contain formulae generated by
  // GenerateICmpZeroScales, in which case adding fixup offsets may
  // be invalid.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    // Only a different use with the same kind and access type as OrigLU,
    // and which already holds some formula over OrigF's registers, is a
    // candidate.
    if (&LU == &OrigLU ||
        LU.Kind == LSRUse::ICmpZero ||
        LU.Kind != OrigLU.Kind || OrigLU.AccessTy != LU.AccessTy ||
        !LU.HasFormulaWithSameRegs(OrigF))
      continue;

    // Scan the candidate's formulae for the one matching OrigF. The kind was
    // already compared against OrigLU above, so it is not tested again here.
    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
         FIdx != NumForms; ++FIdx) {
      const Formula &F = LU.Formulae[FIdx];
      if (F.BaseRegs == OrigF.BaseRegs &&
          F.ScaledReg == OrigF.ScaledReg &&
          F.AM.BaseGV == OrigF.AM.BaseGV &&
          F.AM.Scale == OrigF.AM.Scale) {
        if (F.AM.BaseOffs == 0)
          return &LU;
        // The registers and symbol matched but the offset is nonzero; give
        // up on this use and move on to the next one.
        break;
      }
    }
  }
  return 0;
 }
 void LSRInstance::CollectInterestingTypesAndFactors() {
  SmallSetVector<const SCEV *, 4> Strides;
@ -2722,6 +2788,128 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {
 /// of formulae. This keeps the main solver from taking an extraordinary amount
 /// of time in some worst-case scenarios.
 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    DEBUG(dbgs() << "The search space is too complex.\n");
    DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
                    "which use a superset of registers used by other "
                    "formulae.\n");
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      bool Any = false;
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        for (SmallVectorImpl<const SCEV *>::const_iterator
             I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
            Formula NewF = F;
            NewF.AM.BaseOffs += C->getValue()->getSExtValue();
            NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
                                (I - F.BaseRegs.begin()));
            if (LU.HasFormulaWithSameRegs(NewF)) {
              DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
              LU.DeleteFormula(F);
              --i;
              --e;
              Any = true;
              break;
            }
          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
            if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
              if (!F.AM.BaseGV) {
                Formula NewF = F;
                NewF.AM.BaseGV = GV;
                NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
                                    (I - F.BaseRegs.begin()));
                if (LU.HasFormulaWithSameRegs(NewF)) {
                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
                        dbgs() << '\n');
                  LU.DeleteFormula(F);
                  --i;
                  --e;
                  Any = true;
                  break;
                }
              }
          }
        }
      }
      if (Any)
        LU.RecomputeRegs(LUIdx, RegUses);
    }
    DEBUG(dbgs() << "After pre-selection:\n";
          print_uses(dbgs()));
  }
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    DEBUG(dbgs() << "The search space is too complex.\n");
    DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
                    "separated by a constant offset will use the same "
                    "registers.\n");
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
          if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
            if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
                                   /*HasBaseReg=*/false,
                                   LU.Kind, LU.AccessTy)) {
              DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
                    dbgs() << '\n');
              LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
              // Delete formulae from the new use which are no longer legal.
              bool Any = false;
              for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
                Formula &F = LUThatHas->Formulae[i];
                if (!isLegalUse(F.AM,
                                LUThatHas->MinOffset, LUThatHas->MaxOffset,
                                LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
                        dbgs() << '\n');
                  LUThatHas->DeleteFormula(F);
                  --i;
                  --e;
                  Any = true;
                }
              }
              if (Any)
                LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
              // Update the relocs to reference the new use.
              for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
                if (Fixups[i].LUIdx == LUIdx) {
                  Fixups[i].LUIdx = LUThatHas - &Uses.front();
                  Fixups[i].Offset += F.AM.BaseOffs;
                  DEBUG(errs() << "New fixup has offset "
                               << Fixups[i].Offset << "\n");
                }
                if (Fixups[i].LUIdx == NumUses-1)
                  Fixups[i].LUIdx = LUIdx;
              }
              // Delete the old use.
              std::swap(LU, Uses.back());
              Uses.pop_back();
              --LUIdx;
              --NumUses;
              break;
            }
          }
        }
      }
    }
    DEBUG(dbgs() << "After pre-selection:\n";
          print_uses(dbgs()));
  }
  SmallPtrSet<const SCEV *, 4> Taken;
  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    // Ok, we have too many of formulae on our hands to conveniently handle.
--- a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
+++ b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
@ -0,0 +1,386 @@
 ; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s
 ; LSR should recognize that this is an unrolled loop which can use
 ; constant offset addressing, so that each of the following stores
 ; uses the same register.
 ; CHECK: vstr.32 s0, [r12, #-128]
 ; CHECK: vstr.32 s0, [r12, #-96]
 ; CHECK: vstr.32 s0, [r12, #-64]
 ; CHECK: vstr.32 s0, [r12, #-32]
 ; CHECK: vstr.32 s0, [r12]
 ; CHECK: vstr.32 s0, [r12, #32]
 ; CHECK: vstr.32 s0, [r12, #64]
 ; CHECK: vstr.32 s0, [r12, #96]
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
 %0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* }
 %1 = type { void (%2*)*, void (%2*, i32)*, void (%2*)*, void (%2*, i8*)*, void (%2*)*, i32, %7, i32, i32, i8**, i32, i8**, i32, i32 }
 %2 = type { %1*, %3*, %6*, i8*, i32, i32 }
 %3 = type { i8* (%2*, i32, i32)*, i8* (%2*, i32, i32)*, i8** (%2*, i32, i32, i32)*, [64 x i16]** (%2*, i32, i32, i32)*, %4* (%2*, i32, i32, i32, i32, i32)*, %5* (%2*, i32, i32, i32, i32, i32)*, void (%2*)*, i8** (%2*, %4*, i32, i32, i32)*, [64 x i16]** (%2*, %5*, i32, i32, i32)*, void (%2*, i32)*, void (%2*)*, i32, i32 }
 %4 = type opaque
 %5 = type opaque
 %6 = type { void (%2*)*, i32, i32, i32, i32 }
 %7 = type { [8 x i32], [12 x i32] }
 %8 = type { i8*, i32, void (%0*)*, i32 (%0*)*, void (%0*, i32)*, i32 (%0*, i32)*, void (%0*)* }
 %9 = type { [64 x i16], i32 }
 %10 = type { [17 x i8], [256 x i8], i32 }
 %11 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %9*, i8* }
 %12 = type { %12*, i8, i32, i32, i8* }
 %13 = type { void (%0*)*, void (%0*)*, i32 }
 %14 = type { void (%0*, i32)*, void (%0*, i8**, i32*, i32)* }
 %15 = type { void (%0*)*, i32 (%0*)*, void (%0*)*, i32 (%0*, i8***)*, %5** }
 %16 = type { void (%0*, i32)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)* }
 %17 = type { i32 (%0*)*, void (%0*)*, void (%0*)*, void (%0*)*, i32, i32 }
 %18 = type { void (%0*)*, i32 (%0*)*, i32 (%0*)*, i32, i32, i32, i32 }
 %19 = type { void (%0*)*, i32 (%0*, [64 x i16]**)*, i32 }
 %20 = type { void (%0*)*, [10 x void (%0*, %11*, i16*, i8**, i32)*] }
 %21 = type { void (%0*)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)*, i32 }
 %22 = type { void (%0*)*, void (%0*, i8***, i32, i8**, i32)* }
 %23 = type { void (%0*, i32)*, void (%0*, i8**, i8**, i32)*, void (%0*)*, void (%0*)* }
 ; Two 8-iteration loops over a 64-float stack buffer. The first loop
 ; (bb9..bb156) fills the buffer from the i16 values in %a2 scaled by floats
 ; read through %a1; the second loop (bb167) reads the buffer back and stores
 ; bytes looked up through %a0 into the rows addressed by %a3.
 define arm_apcscc void @test(%0* nocapture %a0, %11* nocapture %a1, i16* nocapture %a2, i8** nocapture %a3, i32 %a4) nounwind {
 bb:
  ; %t is the 64-float scratch buffer shared by both loops.
  %t = alloca [64 x float], align 4           
  ; %t6: i8* loaded from field 65 of %a0; indexed as a byte table in bb167.
  %t5 = getelementptr inbounds %0* %a0, i32 0, i32 65
  %t6 = load i8** %t5, align 4              
  ; %t8: i8* loaded from field 20 of %a1; read as floats in the first loop.
  %t7 = getelementptr inbounds %11* %a1, i32 0, i32 20
  %t8 = load i8** %t7, align 4              
  br label %bb9
 ; First loop header; %t10 starts at 0 and is advanced in bb156.
 bb9:                                            
  %t10 = phi i32 [ 0, %bb ], [ %t157, %bb156 ]
  ; Buffer slots %t[%t10 + 8*k] for k = 1..7, then k = 0 (%t25).
  %t11 = add i32 %t10, 8                    
  %t12 = getelementptr [64 x float]* %t, i32 0, i32 %t11
  %t13 = add i32 %t10, 16                   
  %t14 = getelementptr [64 x float]* %t, i32 0, i32 %t13
  %t15 = add i32 %t10, 24                   
  %t16 = getelementptr [64 x float]* %t, i32 0, i32 %t15
  %t17 = add i32 %t10, 32                   
  %t18 = getelementptr [64 x float]* %t, i32 0, i32 %t17
  %t19 = add i32 %t10, 40                   
  %t20 = getelementptr [64 x float]* %t, i32 0, i32 %t19
  %t21 = add i32 %t10, 48                   
  %t22 = getelementptr [64 x float]* %t, i32 0, i32 %t21
  %t23 = add i32 %t10, 56                   
  %t24 = getelementptr [64 x float]* %t, i32 0, i32 %t23
  %t25 = getelementptr [64 x float]* %t, i32 0, i32 %t10
  ; Float pointers into %t8 at byte offsets (%t10 << 5) | {8,16,24,4,12,20,28}
  ; and finally at (%t10 << 5) itself (%t49).
  %t26 = shl i32 %t10, 5                    
  %t27 = or i32 %t26, 8                     
  %t28 = getelementptr i8* %t8, i32 %t27  
  %t29 = bitcast i8* %t28 to float*         
  %t30 = or i32 %t26, 16                    
  %t31 = getelementptr i8* %t8, i32 %t30  
  %t32 = bitcast i8* %t31 to float*         
  %t33 = or i32 %t26, 24                    
  %t34 = getelementptr i8* %t8, i32 %t33  
  %t35 = bitcast i8* %t34 to float*         
  %t36 = or i32 %t26, 4                     
  %t37 = getelementptr i8* %t8, i32 %t36  
  %t38 = bitcast i8* %t37 to float*         
  %t39 = or i32 %t26, 12                    
  %t40 = getelementptr i8* %t8, i32 %t39  
  %t41 = bitcast i8* %t40 to float*         
  %t42 = or i32 %t26, 20                    
  %t43 = getelementptr i8* %t8, i32 %t42  
  %t44 = bitcast i8* %t43 to float*         
  %t45 = or i32 %t26, 28                    
  %t46 = getelementptr i8* %t8, i32 %t45  
  %t47 = bitcast i8* %t46 to float*         
  %t48 = getelementptr i8* %t8, i32 %t26  
  %t49 = bitcast i8* %t48 to float*         
  ; i16 pointers into %a2 at element indices (%t10 << 3) | {1..7}, then the
  ; unmodified index (%t65).
  %t50 = shl i32 %t10, 3                    
  %t51 = or i32 %t50, 1                     
  %t52 = getelementptr i16* %a2, i32 %t51 
  %t53 = or i32 %t50, 2                     
  %t54 = getelementptr i16* %a2, i32 %t53 
  %t55 = or i32 %t50, 3                     
  %t56 = getelementptr i16* %a2, i32 %t55 
  %t57 = or i32 %t50, 4                     
  %t58 = getelementptr i16* %a2, i32 %t57 
  %t59 = or i32 %t50, 5                     
  %t60 = getelementptr i16* %a2, i32 %t59 
  %t61 = or i32 %t50, 6                     
  %t62 = getelementptr i16* %a2, i32 %t61 
  %t63 = or i32 %t50, 7                     
  %t64 = getelementptr i16* %a2, i32 %t63 
  %t65 = getelementptr i16* %a2, i32 %t50 
  ; If inputs 1 and 2 are both zero, keep testing the rest (bb71); otherwise
  ; take the general path (bb91).
  %t66 = load i16* %t52, align 2            
  %t67 = icmp eq i16 %t66, 0                
  %t68 = load i16* %t54, align 2            
  %t69 = icmp eq i16 %t68, 0                
  %t70 = and i1 %t67, %t69                
  br i1 %t70, label %bb71, label %bb91
 ; bb71..bb83 test inputs 3..7 in turn; any nonzero value branches to bb91.
 bb71:                                           
  %t72 = load i16* %t56, align 2            
  %t73 = icmp eq i16 %t72, 0                
  br i1 %t73, label %bb74, label %bb91
 bb74:                                           
  %t75 = load i16* %t58, align 2            
  %t76 = icmp eq i16 %t75, 0                
  br i1 %t76, label %bb77, label %bb91
 bb77:                                           
  %t78 = load i16* %t60, align 2            
  %t79 = icmp eq i16 %t78, 0                
  br i1 %t79, label %bb80, label %bb91
 bb80:                                           
  %t81 = load i16* %t62, align 2            
  %t82 = icmp eq i16 %t81, 0                
  br i1 %t82, label %bb83, label %bb91
 bb83:                                           
  %t84 = load i16* %t64, align 2            
  %t85 = icmp eq i16 %t84, 0                
  br i1 %t85, label %bb86, label %bb91
 ; Inputs 1..7 were all zero: input 0 times its float factor is stored to all
 ; eight buffer slots. These eight stores of one value, 8 floats apart, are
 ; presumably the ones the CHECK lines at the top of the file match.
 bb86:                                           
  %t87 = load i16* %t65, align 2            
  %t88 = sitofp i16 %t87 to float           
  %t89 = load float* %t49, align 4          
  %t90 = fmul float %t88, %t89            
  store float %t90, float* %t25, align 4
  store float %t90, float* %t12, align 4
  store float %t90, float* %t14, align 4
  store float %t90, float* %t16, align 4
  store float %t90, float* %t18, align 4
  store float %t90, float* %t20, align 4
  store float %t90, float* %t22, align 4
  store float %t90, float* %t24, align 4
  br label %bb156
 ; General path: each input is converted to float and multiplied by its
 ; factor, the products are combined with fadd/fsub/fmul, and one result is
 ; stored to each of the eight buffer slots.
 bb91:                                           
  ; Inputs 0, 2, 4 and 6 with the factors at byte offsets 0, 8, 16 and 24.
  %t92 = load i16* %t65, align 2            
  %t93 = sitofp i16 %t92 to float           
  %t94 = load float* %t49, align 4          
  %t95 = fmul float %t93, %t94            
  %t96 = sitofp i16 %t68 to float           
  %t97 = load float* %t29, align 4          
  %t98 = fmul float %t96, %t97            
  %t99 = load i16* %t58, align 2            
  %t100 = sitofp i16 %t99 to float          
  %t101 = load float* %t32, align 4         
  %t102 = fmul float %t100, %t101         
  %t103 = load i16* %t62, align 2           
  %t104 = sitofp i16 %t103 to float         
  %t105 = load float* %t35, align 4         
  %t106 = fmul float %t104, %t105         
  %t107 = fadd float %t95, %t102          
  %t108 = fsub float %t95, %t102          
  %t109 = fadd float %t98, %t106          
  %t110 = fsub float %t98, %t106          
  %t111 = fmul float %t110, 0x3FF6A09E60000000
  %t112 = fsub float %t111, %t109         
  %t113 = fadd float %t107, %t109         
  %t114 = fsub float %t107, %t109         
  %t115 = fadd float %t108, %t112         
  %t116 = fsub float %t108, %t112         
  ; Inputs 1, 3, 5 and 7 with the factors at byte offsets 4, 12, 20 and 28.
  %t117 = sitofp i16 %t66 to float          
  %t118 = load float* %t38, align 4         
  %t119 = fmul float %t117, %t118         
  %t120 = load i16* %t56, align 2           
  %t121 = sitofp i16 %t120 to float         
  %t122 = load float* %t41, align 4         
  %t123 = fmul float %t121, %t122         
  %t124 = load i16* %t60, align 2           
  %t125 = sitofp i16 %t124 to float         
  %t126 = load float* %t44, align 4         
  %t127 = fmul float %t125, %t126         
  %t128 = load i16* %t64, align 2           
  %t129 = sitofp i16 %t128 to float         
  %t130 = load float* %t47, align 4         
  %t131 = fmul float %t129, %t130         
  %t132 = fadd float %t127, %t123         
  %t133 = fsub float %t127, %t123         
  %t134 = fadd float %t119, %t131         
  %t135 = fsub float %t119, %t131         
  %t136 = fadd float %t134, %t132         
  %t137 = fsub float %t134, %t132         
  %t138 = fmul float %t137, 0x3FF6A09E60000000
  %t139 = fadd float %t133, %t135         
  %t140 = fmul float %t139, 0x3FFD906BC0000000
  %t141 = fmul float %t135, 0x3FF1517A80000000
  %t142 = fsub float %t141, %t140         
  %t143 = fmul float %t133, 0xC004E7AEA0000000
  %t144 = fadd float %t143, %t140         
  %t145 = fsub float %t144, %t136         
  %t146 = fsub float %t138, %t145         
  %t147 = fadd float %t142, %t146         
  ; Store the eight combined results into the buffer slots.
  %t148 = fadd float %t113, %t136         
  store float %t148, float* %t25, align 4
  %t149 = fsub float %t113, %t136         
  store float %t149, float* %t24, align 4
  %t150 = fadd float %t115, %t145         
  store float %t150, float* %t12, align 4
  %t151 = fsub float %t115, %t145         
  store float %t151, float* %t22, align 4
  %t152 = fadd float %t116, %t146         
  store float %t152, float* %t14, align 4
  %t153 = fsub float %t116, %t146         
  store float %t153, float* %t20, align 4
  %t154 = fadd float %t114, %t147         
  store float %t154, float* %t18, align 4
  %t155 = fsub float %t114, %t147         
  store float %t155, float* %t16, align 4
  br label %bb156
 ; First loop latch: advance %t10 and leave for bb159 once it reaches 8.
 bb156:                                          
  %t157 = add i32 %t10, 1                   
  %t158 = icmp eq i32 %t157, 8              
  br i1 %t158, label %bb159, label %bb9
 ; Precompute the offsets %a4 + 1 .. %a4 + 7 used by the byte stores in bb167.
 bb159:                                          
  %t160 = add i32 %a4, 7                    
  %t161 = add i32 %a4, 1                    
  %t162 = add i32 %a4, 6                    
  %t163 = add i32 %a4, 2                    
  %t164 = add i32 %a4, 5                    
  %t165 = add i32 %a4, 4                    
  %t166 = add i32 %a4, 3                    
  br label %bb167
 ; Second loop (single block); %t168 starts at 0 and the loop exits when the
 ; incremented value reaches 8.
 bb167:                                          
  %t168 = phi i32 [ 0, %bb159 ], [ %t293, %bb167 ]
  %t169 = getelementptr i8** %a3, i32 %t168
  ; Buffer slots %t[(%t168 << 3) | {4,2,6,5,3,1,7}], then the unmodified
  ; index (%t185).
  %t170 = shl i32 %t168, 3                  
  %t171 = or i32 %t170, 4                   
  %t172 = getelementptr [64 x float]* %t, i32 0, i32 %t171
  %t173 = or i32 %t170, 2                   
  %t174 = getelementptr [64 x float]* %t, i32 0, i32 %t173
  %t175 = or i32 %t170, 6                   
  %t176 = getelementptr [64 x float]* %t, i32 0, i32 %t175
  %t177 = or i32 %t170, 5                   
  %t178 = getelementptr [64 x float]* %t, i32 0, i32 %t177
  %t179 = or i32 %t170, 3                   
  %t180 = getelementptr [64 x float]* %t, i32 0, i32 %t179
  %t181 = or i32 %t170, 1                   
  %t182 = getelementptr [64 x float]* %t, i32 0, i32 %t181
  %t183 = or i32 %t170, 7                   
  %t184 = getelementptr [64 x float]* %t, i32 0, i32 %t183
  %t185 = getelementptr [64 x float]* %t, i32 0, i32 %t170
  ; %t186 is the i8* loaded from %a3[%t168]; output bytes are stored relative
  ; to it.
  %t186 = load i8** %t169, align 4          
  %t187 = getelementptr inbounds i8* %t186, i32 %a4
  %t188 = load float* %t185, align 4        
  %t189 = load float* %t172, align 4        
  %t190 = fadd float %t188, %t189         
  %t191 = fsub float %t188, %t189         
  %t192 = load float* %t174, align 4        
  %t193 = load float* %t176, align 4        
  %t194 = fadd float %t192, %t193         
  %t195 = fsub float %t192, %t193         
  %t196 = fmul float %t195, 0x3FF6A09E60000000
  %t197 = fsub float %t196, %t194         
  %t198 = fadd float %t190, %t194         
  %t199 = fsub float %t190, %t194         
  %t200 = fadd float %t191, %t197         
  %t201 = fsub float %t191, %t197         
  %t202 = load float* %t178, align 4        
  %t203 = load float* %t180, align 4        
  %t204 = fadd float %t202, %t203         
  %t205 = fsub float %t202, %t203         
  %t206 = load float* %t182, align 4        
  %t207 = load float* %t184, align 4        
  %t208 = fadd float %t206, %t207         
  %t209 = fsub float %t206, %t207         
  %t210 = fadd float %t208, %t204         
  %t211 = fsub float %t208, %t204         
  %t212 = fmul float %t211, 0x3FF6A09E60000000
  %t213 = fadd float %t205, %t209         
  %t214 = fmul float %t213, 0x3FFD906BC0000000
  %t215 = fmul float %t209, 0x3FF1517A80000000
  %t216 = fsub float %t215, %t214         
  %t217 = fmul float %t205, 0xC004E7AEA0000000
  %t218 = fadd float %t217, %t214         
  %t219 = fsub float %t218, %t210         
  %t220 = fsub float %t212, %t219         
  %t221 = fadd float %t216, %t220         
  ; Each of the eight results below is converted to i32, has 4 added, is
  ; shifted right by 3, masked with 1023 and offset by 128 to index %t6. The
  ; byte loaded from there is stored at %t186 + %a4 + {0,7,1,6,2,5,4,3}.
  %t222 = fadd float %t198, %t210         
  %t223 = fptosi float %t222 to i32         
  %t224 = add nsw i32 %t223, 4              
  %t225 = lshr i32 %t224, 3                 
  %t226 = and i32 %t225, 1023               
  %t227 = add i32 %t226, 128                
  %t228 = getelementptr inbounds i8* %t6, i32 %t227
  %t229 = load i8* %t228, align 1           
  store i8 %t229, i8* %t187, align 1
  %t230 = fsub float %t198, %t210         
  %t231 = fptosi float %t230 to i32         
  %t232 = add nsw i32 %t231, 4              
  %t233 = lshr i32 %t232, 3                 
  %t234 = and i32 %t233, 1023               
  %t235 = add i32 %t234, 128                
  %t236 = getelementptr inbounds i8* %t6, i32 %t235
  %t237 = load i8* %t236, align 1           
  %t238 = getelementptr inbounds i8* %t186, i32 %t160
  store i8 %t237, i8* %t238, align 1
  %t239 = fadd float %t200, %t219         
  %t240 = fptosi float %t239 to i32         
  %t241 = add nsw i32 %t240, 4              
  %t242 = lshr i32 %t241, 3                 
  %t243 = and i32 %t242, 1023               
  %t244 = add i32 %t243, 128                
  %t245 = getelementptr inbounds i8* %t6, i32 %t244
  %t246 = load i8* %t245, align 1           
  %t247 = getelementptr inbounds i8* %t186, i32 %t161
  store i8 %t246, i8* %t247, align 1
  %t248 = fsub float %t200, %t219         
  %t249 = fptosi float %t248 to i32         
  %t250 = add nsw i32 %t249, 4              
  %t251 = lshr i32 %t250, 3                 
  %t252 = and i32 %t251, 1023               
  %t253 = add i32 %t252, 128                
  %t254 = getelementptr inbounds i8* %t6, i32 %t253
  %t255 = load i8* %t254, align 1           
  %t256 = getelementptr inbounds i8* %t186, i32 %t162
  store i8 %t255, i8* %t256, align 1
  %t257 = fadd float %t201, %t220         
  %t258 = fptosi float %t257 to i32         
  %t259 = add nsw i32 %t258, 4              
  %t260 = lshr i32 %t259, 3                 
  %t261 = and i32 %t260, 1023               
  %t262 = add i32 %t261, 128                
  %t263 = getelementptr inbounds i8* %t6, i32 %t262
  %t264 = load i8* %t263, align 1           
  %t265 = getelementptr inbounds i8* %t186, i32 %t163
  store i8 %t264, i8* %t265, align 1
  %t266 = fsub float %t201, %t220         
  %t267 = fptosi float %t266 to i32         
  %t268 = add nsw i32 %t267, 4              
  %t269 = lshr i32 %t268, 3                 
  %t270 = and i32 %t269, 1023               
  %t271 = add i32 %t270, 128                
  %t272 = getelementptr inbounds i8* %t6, i32 %t271
  %t273 = load i8* %t272, align 1           
  %t274 = getelementptr inbounds i8* %t186, i32 %t164
  store i8 %t273, i8* %t274, align 1
  %t275 = fadd float %t199, %t221         
  %t276 = fptosi float %t275 to i32         
  %t277 = add nsw i32 %t276, 4              
  %t278 = lshr i32 %t277, 3                 
  %t279 = and i32 %t278, 1023               
  %t280 = add i32 %t279, 128                
  %t281 = getelementptr inbounds i8* %t6, i32 %t280
  %t282 = load i8* %t281, align 1           
  %t283 = getelementptr inbounds i8* %t186, i32 %t165
  store i8 %t282, i8* %t283, align 1
  %t284 = fsub float %t199, %t221         
  %t285 = fptosi float %t284 to i32         
  %t286 = add nsw i32 %t285, 4              
  %t287 = lshr i32 %t286, 3                 
  %t288 = and i32 %t287, 1023               
  %t289 = add i32 %t288, 128                
  %t290 = getelementptr inbounds i8* %t6, i32 %t289
  %t291 = load i8* %t290, align 1           
  %t292 = getelementptr inbounds i8* %t186, i32 %t166
  store i8 %t291, i8* %t292, align 1
  ; Second loop latch.
  %t293 = add nsw i32 %t168, 1              
  %t294 = icmp eq i32 %t293, 8              
  br i1 %t294, label %bb295, label %bb167
 bb295:                                          
  ret void
 }