R600: Fix last ALU of a clause being emitted in a separate clause

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178675 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-09 01:38:03 +00:00 · 2013-04-03 18:24:47 +00:00 · 2013-04-03 18:24:47 +00:00 · 5417223f98
commit 5417223f98
parent b31b099a37
2 changed files with 852 additions and 2 deletions
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -187,6 +187,8 @@ private:
        continue;
      if (!isALU(I))
        break;
+      if (AluInstCount > TII->getMaxAlusPerClause())
+        break;
      if (I->getOpcode() == AMDGPU::PRED_X) {
        if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
          PushBeforeModifier = true;
@@ -201,8 +203,6 @@ private:
          !SubstituteKCacheBank(I, KCacheBanks))
        break;
      AluInstCount += OccupiedDwords(I);
-      if (AluInstCount > TII->getMaxAlusPerClause())
-        break;
    }
    unsigned Opcode = PushBeforeModifier ?
        AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
--- a/test/CodeGen/R600/alu-split.ll
+++ b/test/CodeGen/R600/alu-split.ll
@@ -0,0 +1,850 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: ALU
+;CHECK: ALU
+;CHECK: ALU
+;CHECK-NOT: ALU
+
+define void @main() #0 {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %5 = extractelement <4 x float> %4, i32 0
+  %6 = fcmp une float 0x4016F2B020000000, %5
+  %7 = select i1 %6, float 1.000000e+00, float 0.000000e+00
+  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %9 = extractelement <4 x float> %8, i32 1
+  %10 = fcmp une float 0x401FDCC640000000, %9
+  %11 = select i1 %10, float 1.000000e+00, float 0.000000e+00
+  %12 = fsub float -0.000000e+00, %7
+  %13 = fptosi float %12 to i32
+  %14 = fsub float -0.000000e+00, %11
+  %15 = fptosi float %14 to i32
+  %16 = bitcast i32 %13 to float
+  %17 = bitcast i32 %15 to float
+  %18 = bitcast float %16 to i32
+  %19 = bitcast float %17 to i32
+  %20 = or i32 %18, %19
+  %21 = bitcast i32 %20 to float
+  %22 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 17)
+  %23 = extractelement <4 x float> %22, i32 0
+  %24 = fcmp une float 0xC00574BC60000000, %23
+  %25 = select i1 %24, float 1.000000e+00, float 0.000000e+00
+  %26 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 17)
+  %27 = extractelement <4 x float> %26, i32 1
+  %28 = fcmp une float 0x40210068E0000000, %27
+  %29 = select i1 %28, float 1.000000e+00, float 0.000000e+00
+  %30 = fsub float -0.000000e+00, %25
+  %31 = fptosi float %30 to i32
+  %32 = fsub float -0.000000e+00, %29
+  %33 = fptosi float %32 to i32
+  %34 = bitcast i32 %31 to float
+  %35 = bitcast i32 %33 to float
+  %36 = bitcast float %34 to i32
+  %37 = bitcast float %35 to i32
+  %38 = or i32 %36, %37
+  %39 = bitcast i32 %38 to float
+  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 18)
+  %41 = extractelement <4 x float> %40, i32 0
+  %42 = fcmp une float 0xBFC9A6B500000000, %41
+  %43 = select i1 %42, float 1.000000e+00, float 0.000000e+00
+  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 18)
+  %45 = extractelement <4 x float> %44, i32 1
+  %46 = fcmp une float 0xC0119BDA60000000, %45
+  %47 = select i1 %46, float 1.000000e+00, float 0.000000e+00
+  %48 = fsub float -0.000000e+00, %43
+  %49 = fptosi float %48 to i32
+  %50 = fsub float -0.000000e+00, %47
+  %51 = fptosi float %50 to i32
+  %52 = bitcast i32 %49 to float
+  %53 = bitcast i32 %51 to float
+  %54 = bitcast float %52 to i32
+  %55 = bitcast float %53 to i32
+  %56 = or i32 %54, %55
+  %57 = bitcast i32 %56 to float
+  %58 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 19)
+  %59 = extractelement <4 x float> %58, i32 0
+  %60 = fcmp une float 0xC02085D640000000, %59
+  %61 = select i1 %60, float 1.000000e+00, float 0.000000e+00
+  %62 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 19)
+  %63 = extractelement <4 x float> %62, i32 1
+  %64 = fcmp une float 0xBFD7C1BDA0000000, %63
+  %65 = select i1 %64, float 1.000000e+00, float 0.000000e+00
+  %66 = fsub float -0.000000e+00, %61
+  %67 = fptosi float %66 to i32
+  %68 = fsub float -0.000000e+00, %65
+  %69 = fptosi float %68 to i32
+  %70 = bitcast i32 %67 to float
+  %71 = bitcast i32 %69 to float
+  %72 = bitcast float %70 to i32
+  %73 = bitcast float %71 to i32
+  %74 = or i32 %72, %73
+  %75 = bitcast i32 %74 to float
+  %76 = insertelement <4 x float> undef, float %21, i32 0
+  %77 = insertelement <4 x float> %76, float %39, i32 1
+  %78 = insertelement <4 x float> %77, float %57, i32 2
+  %79 = insertelement <4 x float> %78, float %75, i32 3
+  %80 = insertelement <4 x float> undef, float %21, i32 0
+  %81 = insertelement <4 x float> %80, float %39, i32 1
+  %82 = insertelement <4 x float> %81, float %57, i32 2
+  %83 = insertelement <4 x float> %82, float %75, i32 3
+  %84 = call float @llvm.AMDGPU.dp4(<4 x float> %79, <4 x float> %83)
+  %85 = bitcast float %84 to i32
+  %86 = icmp ne i32 %85, 0
+  %87 = sext i1 %86 to i32
+  %88 = bitcast i32 %87 to float
+  %89 = bitcast float %88 to i32
+  %90 = xor i32 %89, -1
+  %91 = bitcast i32 %90 to float
+  %92 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 20)
+  %93 = extractelement <4 x float> %92, i32 0
+  %94 = fcmp une float 0x401FDCC640000000, %93
+  %95 = select i1 %94, float 1.000000e+00, float 0.000000e+00
+  %96 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 20)
+  %97 = extractelement <4 x float> %96, i32 1
+  %98 = fcmp une float 0xC00574BC60000000, %97
+  %99 = select i1 %98, float 1.000000e+00, float 0.000000e+00
+  %100 = fsub float -0.000000e+00, %95
+  %101 = fptosi float %100 to i32
+  %102 = fsub float -0.000000e+00, %99
+  %103 = fptosi float %102 to i32
+  %104 = bitcast i32 %101 to float
+  %105 = bitcast i32 %103 to float
+  %106 = bitcast float %104 to i32
+  %107 = bitcast float %105 to i32
+  %108 = or i32 %106, %107
+  %109 = bitcast i32 %108 to float
+  %110 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 21)
+  %111 = extractelement <4 x float> %110, i32 0
+  %112 = fcmp une float 0x40210068E0000000, %111
+  %113 = select i1 %112, float 1.000000e+00, float 0.000000e+00
+  %114 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 21)
+  %115 = extractelement <4 x float> %114, i32 1
+  %116 = fcmp une float 0xBFC9A6B500000000, %115
+  %117 = select i1 %116, float 1.000000e+00, float 0.000000e+00
+  %118 = fsub float -0.000000e+00, %113
+  %119 = fptosi float %118 to i32
+  %120 = fsub float -0.000000e+00, %117
+  %121 = fptosi float %120 to i32
+  %122 = bitcast i32 %119 to float
+  %123 = bitcast i32 %121 to float
+  %124 = bitcast float %122 to i32
+  %125 = bitcast float %123 to i32
+  %126 = or i32 %124, %125
+  %127 = bitcast i32 %126 to float
+  %128 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 22)
+  %129 = extractelement <4 x float> %128, i32 0
+  %130 = fcmp une float 0xC0119BDA60000000, %129
+  %131 = select i1 %130, float 1.000000e+00, float 0.000000e+00
+  %132 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 22)
+  %133 = extractelement <4 x float> %132, i32 1
+  %134 = fcmp une float 0xC02085D640000000, %133
+  %135 = select i1 %134, float 1.000000e+00, float 0.000000e+00
+  %136 = fsub float -0.000000e+00, %131
+  %137 = fptosi float %136 to i32
+  %138 = fsub float -0.000000e+00, %135
+  %139 = fptosi float %138 to i32
+  %140 = bitcast i32 %137 to float
+  %141 = bitcast i32 %139 to float
+  %142 = bitcast float %140 to i32
+  %143 = bitcast float %141 to i32
+  %144 = or i32 %142, %143
+  %145 = bitcast i32 %144 to float
+  %146 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
+  %147 = extractelement <4 x float> %146, i32 0
+  %148 = fcmp une float 0xBFD7C1BDA0000000, %147
+  %149 = select i1 %148, float 1.000000e+00, float 0.000000e+00
+  %150 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
+  %151 = extractelement <4 x float> %150, i32 1
+  %152 = fcmp une float 0x401E1D7DC0000000, %151
+  %153 = select i1 %152, float 1.000000e+00, float 0.000000e+00
+  %154 = fsub float -0.000000e+00, %149
+  %155 = fptosi float %154 to i32
+  %156 = fsub float -0.000000e+00, %153
+  %157 = fptosi float %156 to i32
+  %158 = bitcast i32 %155 to float
+  %159 = bitcast i32 %157 to float
+  %160 = bitcast float %158 to i32
+  %161 = bitcast float %159 to i32
+  %162 = or i32 %160, %161
+  %163 = bitcast i32 %162 to float
+  %164 = insertelement <4 x float> undef, float %109, i32 0
+  %165 = insertelement <4 x float> %164, float %127, i32 1
+  %166 = insertelement <4 x float> %165, float %145, i32 2
+  %167 = insertelement <4 x float> %166, float %163, i32 3
+  %168 = insertelement <4 x float> undef, float %109, i32 0
+  %169 = insertelement <4 x float> %168, float %127, i32 1
+  %170 = insertelement <4 x float> %169, float %145, i32 2
+  %171 = insertelement <4 x float> %170, float %163, i32 3
+  %172 = call float @llvm.AMDGPU.dp4(<4 x float> %167, <4 x float> %171)
+  %173 = bitcast float %172 to i32
+  %174 = icmp ne i32 %173, 0
+  %175 = sext i1 %174 to i32
+  %176 = bitcast i32 %175 to float
+  %177 = bitcast float %176 to i32
+  %178 = xor i32 %177, -1
+  %179 = bitcast i32 %178 to float
+  %180 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %181 = extractelement <4 x float> %180, i32 0
+  %182 = fcmp une float 0x401FDCC640000000, %181
+  %183 = select i1 %182, float 1.000000e+00, float 0.000000e+00
+  %184 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %185 = extractelement <4 x float> %184, i32 1
+  %186 = fcmp une float 0xC00574BC60000000, %185
+  %187 = select i1 %186, float 1.000000e+00, float 0.000000e+00
+  %188 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %189 = extractelement <4 x float> %188, i32 2
+  %190 = fcmp une float 0x40210068E0000000, %189
+  %191 = select i1 %190, float 1.000000e+00, float 0.000000e+00
+  %192 = fsub float -0.000000e+00, %183
+  %193 = fptosi float %192 to i32
+  %194 = fsub float -0.000000e+00, %187
+  %195 = fptosi float %194 to i32
+  %196 = fsub float -0.000000e+00, %191
+  %197 = fptosi float %196 to i32
+  %198 = bitcast i32 %193 to float
+  %199 = bitcast i32 %195 to float
+  %200 = bitcast i32 %197 to float
+  %201 = bitcast float %199 to i32
+  %202 = bitcast float %200 to i32
+  %203 = or i32 %201, %202
+  %204 = bitcast i32 %203 to float
+  %205 = bitcast float %198 to i32
+  %206 = bitcast float %204 to i32
+  %207 = or i32 %205, %206
+  %208 = bitcast i32 %207 to float
+  %209 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %210 = extractelement <4 x float> %209, i32 0
+  %211 = fcmp une float 0xBFC9A6B500000000, %210
+  %212 = select i1 %211, float 1.000000e+00, float 0.000000e+00
+  %213 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %214 = extractelement <4 x float> %213, i32 1
+  %215 = fcmp une float 0xC0119BDA60000000, %214
+  %216 = select i1 %215, float 1.000000e+00, float 0.000000e+00
+  %217 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %218 = extractelement <4 x float> %217, i32 2
+  %219 = fcmp une float 0xC02085D640000000, %218
+  %220 = select i1 %219, float 1.000000e+00, float 0.000000e+00
+  %221 = fsub float -0.000000e+00, %212
+  %222 = fptosi float %221 to i32
+  %223 = fsub float -0.000000e+00, %216
+  %224 = fptosi float %223 to i32
+  %225 = fsub float -0.000000e+00, %220
+  %226 = fptosi float %225 to i32
+  %227 = bitcast i32 %222 to float
+  %228 = bitcast i32 %224 to float
+  %229 = bitcast i32 %226 to float
+  %230 = bitcast float %228 to i32
+  %231 = bitcast float %229 to i32
+  %232 = or i32 %230, %231
+  %233 = bitcast i32 %232 to float
+  %234 = bitcast float %227 to i32
+  %235 = bitcast float %233 to i32
+  %236 = or i32 %234, %235
+  %237 = bitcast i32 %236 to float
+  %238 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %239 = extractelement <4 x float> %238, i32 0
+  %240 = fcmp une float 0xBFD7C1BDA0000000, %239
+  %241 = select i1 %240, float 1.000000e+00, float 0.000000e+00
+  %242 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %243 = extractelement <4 x float> %242, i32 1
+  %244 = fcmp une float 0x401E1D7DC0000000, %243
+  %245 = select i1 %244, float 1.000000e+00, float 0.000000e+00
+  %246 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %247 = extractelement <4 x float> %246, i32 2
+  %248 = fcmp une float 0xC019893740000000, %247
+  %249 = select i1 %248, float 1.000000e+00, float 0.000000e+00
+  %250 = fsub float -0.000000e+00, %241
+  %251 = fptosi float %250 to i32
+  %252 = fsub float -0.000000e+00, %245
+  %253 = fptosi float %252 to i32
+  %254 = fsub float -0.000000e+00, %249
+  %255 = fptosi float %254 to i32
+  %256 = bitcast i32 %251 to float
+  %257 = bitcast i32 %253 to float
+  %258 = bitcast i32 %255 to float
+  %259 = bitcast float %257 to i32
+  %260 = bitcast float %258 to i32
+  %261 = or i32 %259, %260
+  %262 = bitcast i32 %261 to float
+  %263 = bitcast float %256 to i32
+  %264 = bitcast float %262 to i32
+  %265 = or i32 %263, %264
+  %266 = bitcast i32 %265 to float
+  %267 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %268 = extractelement <4 x float> %267, i32 0
+  %269 = fcmp une float 0x40220F0D80000000, %268
+  %270 = select i1 %269, float 1.000000e+00, float 0.000000e+00
+  %271 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %272 = extractelement <4 x float> %271, i32 1
+  %273 = fcmp une float 0xC018E2EB20000000, %272
+  %274 = select i1 %273, float 1.000000e+00, float 0.000000e+00
+  %275 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %276 = extractelement <4 x float> %275, i32 2
+  %277 = fcmp une float 0xBFEA8DB8C0000000, %276
+  %278 = select i1 %277, float 1.000000e+00, float 0.000000e+00
+  %279 = fsub float -0.000000e+00, %270
+  %280 = fptosi float %279 to i32
+  %281 = fsub float -0.000000e+00, %274
+  %282 = fptosi float %281 to i32
+  %283 = fsub float -0.000000e+00, %278
+  %284 = fptosi float %283 to i32
+  %285 = bitcast i32 %280 to float
+  %286 = bitcast i32 %282 to float
+  %287 = bitcast i32 %284 to float
+  %288 = bitcast float %286 to i32
+  %289 = bitcast float %287 to i32
+  %290 = or i32 %288, %289
+  %291 = bitcast i32 %290 to float
+  %292 = bitcast float %285 to i32
+  %293 = bitcast float %291 to i32
+  %294 = or i32 %292, %293
+  %295 = bitcast i32 %294 to float
+  %296 = insertelement <4 x float> undef, float %208, i32 0
+  %297 = insertelement <4 x float> %296, float %237, i32 1
+  %298 = insertelement <4 x float> %297, float %266, i32 2
+  %299 = insertelement <4 x float> %298, float %295, i32 3
+  %300 = insertelement <4 x float> undef, float %208, i32 0
+  %301 = insertelement <4 x float> %300, float %237, i32 1
+  %302 = insertelement <4 x float> %301, float %266, i32 2
+  %303 = insertelement <4 x float> %302, float %295, i32 3
+  %304 = call float @llvm.AMDGPU.dp4(<4 x float> %299, <4 x float> %303)
+  %305 = bitcast float %304 to i32
+  %306 = icmp ne i32 %305, 0
+  %307 = sext i1 %306 to i32
+  %308 = bitcast i32 %307 to float
+  %309 = bitcast float %308 to i32
+  %310 = xor i32 %309, -1
+  %311 = bitcast i32 %310 to float
+  %312 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+  %313 = extractelement <4 x float> %312, i32 0
+  %314 = fcmp une float 0xC00574BC60000000, %313
+  %315 = select i1 %314, float 1.000000e+00, float 0.000000e+00
+  %316 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+  %317 = extractelement <4 x float> %316, i32 1
+  %318 = fcmp une float 0x40210068E0000000, %317
+  %319 = select i1 %318, float 1.000000e+00, float 0.000000e+00
+  %320 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+  %321 = extractelement <4 x float> %320, i32 2
+  %322 = fcmp une float 0xBFC9A6B500000000, %321
+  %323 = select i1 %322, float 1.000000e+00, float 0.000000e+00
+  %324 = fsub float -0.000000e+00, %315
+  %325 = fptosi float %324 to i32
+  %326 = fsub float -0.000000e+00, %319
+  %327 = fptosi float %326 to i32
+  %328 = fsub float -0.000000e+00, %323
+  %329 = fptosi float %328 to i32
+  %330 = bitcast i32 %325 to float
+  %331 = bitcast i32 %327 to float
+  %332 = bitcast i32 %329 to float
+  %333 = bitcast float %331 to i32
+  %334 = bitcast float %332 to i32
+  %335 = or i32 %333, %334
+  %336 = bitcast i32 %335 to float
+  %337 = bitcast float %330 to i32
+  %338 = bitcast float %336 to i32
+  %339 = or i32 %337, %338
+  %340 = bitcast i32 %339 to float
+  %341 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
+  %342 = extractelement <4 x float> %341, i32 0
+  %343 = fcmp une float 0xC0119BDA60000000, %342
+  %344 = select i1 %343, float 1.000000e+00, float 0.000000e+00
+  %345 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
+  %346 = extractelement <4 x float> %345, i32 1
+  %347 = fcmp une float 0xC02085D640000000, %346
+  %348 = select i1 %347, float 1.000000e+00, float 0.000000e+00
+  %349 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
+  %350 = extractelement <4 x float> %349, i32 2
+  %351 = fcmp une float 0xBFD7C1BDA0000000, %350
+  %352 = select i1 %351, float 1.000000e+00, float 0.000000e+00
+  %353 = fsub float -0.000000e+00, %344
+  %354 = fptosi float %353 to i32
+  %355 = fsub float -0.000000e+00, %348
+  %356 = fptosi float %355 to i32
+  %357 = fsub float -0.000000e+00, %352
+  %358 = fptosi float %357 to i32
+  %359 = bitcast i32 %354 to float
+  %360 = bitcast i32 %356 to float
+  %361 = bitcast i32 %358 to float
+  %362 = bitcast float %360 to i32
+  %363 = bitcast float %361 to i32
+  %364 = or i32 %362, %363
+  %365 = bitcast i32 %364 to float
+  %366 = bitcast float %359 to i32
+  %367 = bitcast float %365 to i32
+  %368 = or i32 %366, %367
+  %369 = bitcast i32 %368 to float
+  %370 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %371 = extractelement <4 x float> %370, i32 0
+  %372 = fcmp une float 0x401E1D7DC0000000, %371
+  %373 = select i1 %372, float 1.000000e+00, float 0.000000e+00
+  %374 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %375 = extractelement <4 x float> %374, i32 1
+  %376 = fcmp une float 0xC019893740000000, %375
+  %377 = select i1 %376, float 1.000000e+00, float 0.000000e+00
+  %378 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %379 = extractelement <4 x float> %378, i32 2
+  %380 = fcmp une float 0x40220F0D80000000, %379
+  %381 = select i1 %380, float 1.000000e+00, float 0.000000e+00
+  %382 = fsub float -0.000000e+00, %373
+  %383 = fptosi float %382 to i32
+  %384 = fsub float -0.000000e+00, %377
+  %385 = fptosi float %384 to i32
+  %386 = fsub float -0.000000e+00, %381
+  %387 = fptosi float %386 to i32
+  %388 = bitcast i32 %383 to float
+  %389 = bitcast i32 %385 to float
+  %390 = bitcast i32 %387 to float
+  %391 = bitcast float %389 to i32
+  %392 = bitcast float %390 to i32
+  %393 = or i32 %391, %392
+  %394 = bitcast i32 %393 to float
+  %395 = bitcast float %388 to i32
+  %396 = bitcast float %394 to i32
+  %397 = or i32 %395, %396
+  %398 = bitcast i32 %397 to float
+  %399 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
+  %400 = extractelement <4 x float> %399, i32 0
+  %401 = fcmp une float 0xC018E2EB20000000, %400
+  %402 = select i1 %401, float 1.000000e+00, float 0.000000e+00
+  %403 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
+  %404 = extractelement <4 x float> %403, i32 1
+  %405 = fcmp une float 0xBFEA8DB8C0000000, %404
+  %406 = select i1 %405, float 1.000000e+00, float 0.000000e+00
+  %407 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
+  %408 = extractelement <4 x float> %407, i32 2
+  %409 = fcmp une float 0x4015236E20000000, %408
+  %410 = select i1 %409, float 1.000000e+00, float 0.000000e+00
+  %411 = fsub float -0.000000e+00, %402
+  %412 = fptosi float %411 to i32
+  %413 = fsub float -0.000000e+00, %406
+  %414 = fptosi float %413 to i32
+  %415 = fsub float -0.000000e+00, %410
+  %416 = fptosi float %415 to i32
+  %417 = bitcast i32 %412 to float
+  %418 = bitcast i32 %414 to float
+  %419 = bitcast i32 %416 to float
+  %420 = bitcast float %418 to i32
+  %421 = bitcast float %419 to i32
+  %422 = or i32 %420, %421
+  %423 = bitcast i32 %422 to float
+  %424 = bitcast float %417 to i32
+  %425 = bitcast float %423 to i32
+  %426 = or i32 %424, %425
+  %427 = bitcast i32 %426 to float
+  %428 = insertelement <4 x float> undef, float %340, i32 0
+  %429 = insertelement <4 x float> %428, float %369, i32 1
+  %430 = insertelement <4 x float> %429, float %398, i32 2
+  %431 = insertelement <4 x float> %430, float %427, i32 3
+  %432 = insertelement <4 x float> undef, float %340, i32 0
+  %433 = insertelement <4 x float> %432, float %369, i32 1
+  %434 = insertelement <4 x float> %433, float %398, i32 2
+  %435 = insertelement <4 x float> %434, float %427, i32 3
+  %436 = call float @llvm.AMDGPU.dp4(<4 x float> %431, <4 x float> %435)
+  %437 = bitcast float %436 to i32
+  %438 = icmp ne i32 %437, 0
+  %439 = sext i1 %438 to i32
+  %440 = bitcast i32 %439 to float
+  %441 = bitcast float %440 to i32
+  %442 = xor i32 %441, -1
+  %443 = bitcast i32 %442 to float
+  %444 = load <4 x float> addrspace(8)* null
+  %445 = extractelement <4 x float> %444, i32 0
+  %446 = fcmp une float 0xC00574BC60000000, %445
+  %447 = select i1 %446, float 1.000000e+00, float 0.000000e+00
+  %448 = load <4 x float> addrspace(8)* null
+  %449 = extractelement <4 x float> %448, i32 1
+  %450 = fcmp une float 0x40210068E0000000, %449
+  %451 = select i1 %450, float 1.000000e+00, float 0.000000e+00
+  %452 = load <4 x float> addrspace(8)* null
+  %453 = extractelement <4 x float> %452, i32 2
+  %454 = fcmp une float 0xBFC9A6B500000000, %453
+  %455 = select i1 %454, float 1.000000e+00, float 0.000000e+00
+  %456 = load <4 x float> addrspace(8)* null
+  %457 = extractelement <4 x float> %456, i32 3
+  %458 = fcmp une float 0xC0119BDA60000000, %457
+  %459 = select i1 %458, float 1.000000e+00, float 0.000000e+00
+  %460 = fsub float -0.000000e+00, %447
+  %461 = fptosi float %460 to i32
+  %462 = fsub float -0.000000e+00, %451
+  %463 = fptosi float %462 to i32
+  %464 = fsub float -0.000000e+00, %455
+  %465 = fptosi float %464 to i32
+  %466 = fsub float -0.000000e+00, %459
+  %467 = fptosi float %466 to i32
+  %468 = bitcast i32 %461 to float
+  %469 = bitcast i32 %463 to float
+  %470 = bitcast i32 %465 to float
+  %471 = bitcast i32 %467 to float
+  %472 = bitcast float %468 to i32
+  %473 = bitcast float %469 to i32
+  %474 = or i32 %472, %473
+  %475 = bitcast i32 %474 to float
+  %476 = bitcast float %470 to i32
+  %477 = bitcast float %471 to i32
+  %478 = or i32 %476, %477
+  %479 = bitcast i32 %478 to float
+  %480 = bitcast float %475 to i32
+  %481 = bitcast float %479 to i32
+  %482 = or i32 %480, %481
+  %483 = bitcast i32 %482 to float
+  %484 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %485 = extractelement <4 x float> %484, i32 0
+  %486 = fcmp une float 0xC02085D640000000, %485
+  %487 = select i1 %486, float 1.000000e+00, float 0.000000e+00
+  %488 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %489 = extractelement <4 x float> %488, i32 1
+  %490 = fcmp une float 0xBFD7C1BDA0000000, %489
+  %491 = select i1 %490, float 1.000000e+00, float 0.000000e+00
+  %492 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %493 = extractelement <4 x float> %492, i32 2
+  %494 = fcmp une float 0x401E1D7DC0000000, %493
+  %495 = select i1 %494, float 1.000000e+00, float 0.000000e+00
+  %496 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %497 = extractelement <4 x float> %496, i32 3
+  %498 = fcmp une float 0xC019893740000000, %497
+  %499 = select i1 %498, float 1.000000e+00, float 0.000000e+00
+  %500 = fsub float -0.000000e+00, %487
+  %501 = fptosi float %500 to i32
+  %502 = fsub float -0.000000e+00, %491
+  %503 = fptosi float %502 to i32
+  %504 = fsub float -0.000000e+00, %495
+  %505 = fptosi float %504 to i32
+  %506 = fsub float -0.000000e+00, %499
+  %507 = fptosi float %506 to i32
+  %508 = bitcast i32 %501 to float
+  %509 = bitcast i32 %503 to float
+  %510 = bitcast i32 %505 to float
+  %511 = bitcast i32 %507 to float
+  %512 = bitcast float %508 to i32
+  %513 = bitcast float %509 to i32
+  %514 = or i32 %512, %513
+  %515 = bitcast i32 %514 to float
+  %516 = bitcast float %510 to i32
+  %517 = bitcast float %511 to i32
+  %518 = or i32 %516, %517
+  %519 = bitcast i32 %518 to float
+  %520 = bitcast float %515 to i32
+  %521 = bitcast float %519 to i32
+  %522 = or i32 %520, %521
+  %523 = bitcast i32 %522 to float
+  %524 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %525 = extractelement <4 x float> %524, i32 0
+  %526 = fcmp une float 0x40220F0D80000000, %525
+  %527 = select i1 %526, float 1.000000e+00, float 0.000000e+00
+  %528 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %529 = extractelement <4 x float> %528, i32 1
+  %530 = fcmp une float 0xC018E2EB20000000, %529
+  %531 = select i1 %530, float 1.000000e+00, float 0.000000e+00
+  %532 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %533 = extractelement <4 x float> %532, i32 2
+  %534 = fcmp une float 0xBFEA8DB8C0000000, %533
+  %535 = select i1 %534, float 1.000000e+00, float 0.000000e+00
+  %536 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %537 = extractelement <4 x float> %536, i32 3
+  %538 = fcmp une float 0x4015236E20000000, %537
+  %539 = select i1 %538, float 1.000000e+00, float 0.000000e+00
+  %540 = fsub float -0.000000e+00, %527
+  %541 = fptosi float %540 to i32
+  %542 = fsub float -0.000000e+00, %531
+  %543 = fptosi float %542 to i32
+  %544 = fsub float -0.000000e+00, %535
+  %545 = fptosi float %544 to i32
+  %546 = fsub float -0.000000e+00, %539
+  %547 = fptosi float %546 to i32
+  %548 = bitcast i32 %541 to float
+  %549 = bitcast i32 %543 to float
+  %550 = bitcast i32 %545 to float
+  %551 = bitcast i32 %547 to float
+  %552 = bitcast float %548 to i32
+  %553 = bitcast float %549 to i32
+  %554 = or i32 %552, %553
+  %555 = bitcast i32 %554 to float
+  %556 = bitcast float %550 to i32
+  %557 = bitcast float %551 to i32
+  %558 = or i32 %556, %557
+  %559 = bitcast i32 %558 to float
+  %560 = bitcast float %555 to i32
+  %561 = bitcast float %559 to i32
+  %562 = or i32 %560, %561
+  %563 = bitcast i32 %562 to float
+  %564 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %565 = extractelement <4 x float> %564, i32 0
+  %566 = fcmp une float 0x4016ED5D00000000, %565
+  %567 = select i1 %566, float 1.000000e+00, float 0.000000e+00
+  %568 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %569 = extractelement <4 x float> %568, i32 1
+  %570 = fcmp une float 0x402332FEC0000000, %569
+  %571 = select i1 %570, float 1.000000e+00, float 0.000000e+00
+  %572 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %573 = extractelement <4 x float> %572, i32 2
+  %574 = fcmp une float 0xC01484B5E0000000, %573
+  %575 = select i1 %574, float 1.000000e+00, float 0.000000e+00
+  %576 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %577 = extractelement <4 x float> %576, i32 3
+  %578 = fcmp une float 0x400179A6C0000000, %577
+  %579 = select i1 %578, float 1.000000e+00, float 0.000000e+00
+  %580 = fsub float -0.000000e+00, %567
+  %581 = fptosi float %580 to i32
+  %582 = fsub float -0.000000e+00, %571
+  %583 = fptosi float %582 to i32
+  %584 = fsub float -0.000000e+00, %575
+  %585 = fptosi float %584 to i32
+  %586 = fsub float -0.000000e+00, %579
+  %587 = fptosi float %586 to i32
+  %588 = bitcast i32 %581 to float
+  %589 = bitcast i32 %583 to float
+  %590 = bitcast i32 %585 to float
+  %591 = bitcast i32 %587 to float
+  %592 = bitcast float %588 to i32
+  %593 = bitcast float %589 to i32
+  %594 = or i32 %592, %593
+  %595 = bitcast i32 %594 to float
+  %596 = bitcast float %590 to i32
+  %597 = bitcast float %591 to i32
+  %598 = or i32 %596, %597
+  %599 = bitcast i32 %598 to float
+  %600 = bitcast float %595 to i32
+  %601 = bitcast float %599 to i32
+  %602 = or i32 %600, %601
+  %603 = bitcast i32 %602 to float
+  %604 = insertelement <4 x float> undef, float %483, i32 0
+  %605 = insertelement <4 x float> %604, float %523, i32 1
+  %606 = insertelement <4 x float> %605, float %563, i32 2
+  %607 = insertelement <4 x float> %606, float %603, i32 3
+  %608 = insertelement <4 x float> undef, float %483, i32 0
+  %609 = insertelement <4 x float> %608, float %523, i32 1
+  %610 = insertelement <4 x float> %609, float %563, i32 2
+  %611 = insertelement <4 x float> %610, float %603, i32 3
+  %612 = call float @llvm.AMDGPU.dp4(<4 x float> %607, <4 x float> %611)
+  %613 = bitcast float %612 to i32
+  %614 = icmp ne i32 %613, 0
+  %615 = sext i1 %614 to i32
+  %616 = bitcast i32 %615 to float
+  %617 = bitcast float %616 to i32
+  %618 = xor i32 %617, -1
+  %619 = bitcast i32 %618 to float
+  %620 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %621 = extractelement <4 x float> %620, i32 0
+  %622 = fcmp une float 0x40210068E0000000, %621
+  %623 = select i1 %622, float 1.000000e+00, float 0.000000e+00
+  %624 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %625 = extractelement <4 x float> %624, i32 1
+  %626 = fcmp une float 0xBFC9A6B500000000, %625
+  %627 = select i1 %626, float 1.000000e+00, float 0.000000e+00
+  %628 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %629 = extractelement <4 x float> %628, i32 2
+  %630 = fcmp une float 0xC0119BDA60000000, %629
+  %631 = select i1 %630, float 1.000000e+00, float 0.000000e+00
+  %632 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %633 = extractelement <4 x float> %632, i32 3
+  %634 = fcmp une float 0xC02085D640000000, %633
+  %635 = select i1 %634, float 1.000000e+00, float 0.000000e+00
+  %636 = fsub float -0.000000e+00, %623
+  %637 = fptosi float %636 to i32
+  %638 = fsub float -0.000000e+00, %627
+  %639 = fptosi float %638 to i32
+  %640 = fsub float -0.000000e+00, %631
+  %641 = fptosi float %640 to i32
+  %642 = fsub float -0.000000e+00, %635
+  %643 = fptosi float %642 to i32
+  %644 = bitcast i32 %637 to float
+  %645 = bitcast i32 %639 to float
+  %646 = bitcast i32 %641 to float
+  %647 = bitcast i32 %643 to float
+  %648 = bitcast float %644 to i32
+  %649 = bitcast float %645 to i32
+  %650 = or i32 %648, %649
+  %651 = bitcast i32 %650 to float
+  %652 = bitcast float %646 to i32
+  %653 = bitcast float %647 to i32
+  %654 = or i32 %652, %653
+  %655 = bitcast i32 %654 to float
+  %656 = bitcast float %651 to i32
+  %657 = bitcast float %655 to i32
+  %658 = or i32 %656, %657
+  %659 = bitcast i32 %658 to float
+  %660 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %661 = extractelement <4 x float> %660, i32 0
+  %662 = fcmp une float 0xBFD7C1BDA0000000, %661
+  %663 = select i1 %662, float 1.000000e+00, float 0.000000e+00
+  %664 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %665 = extractelement <4 x float> %664, i32 1
+  %666 = fcmp une float 0x401E1D7DC0000000, %665
+  %667 = select i1 %666, float 1.000000e+00, float 0.000000e+00
+  %668 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %669 = extractelement <4 x float> %668, i32 2
+  %670 = fcmp une float 0xC019893740000000, %669
+  %671 = select i1 %670, float 1.000000e+00, float 0.000000e+00
+  %672 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %673 = extractelement <4 x float> %672, i32 3
+  %674 = fcmp une float 0x40220F0D80000000, %673
+  %675 = select i1 %674, float 1.000000e+00, float 0.000000e+00
+  %676 = fsub float -0.000000e+00, %663
+  %677 = fptosi float %676 to i32
+  %678 = fsub float -0.000000e+00, %667
+  %679 = fptosi float %678 to i32
+  %680 = fsub float -0.000000e+00, %671
+  %681 = fptosi float %680 to i32
+  %682 = fsub float -0.000000e+00, %675
+  %683 = fptosi float %682 to i32
+  %684 = bitcast i32 %677 to float
+  %685 = bitcast i32 %679 to float
+  %686 = bitcast i32 %681 to float
+  %687 = bitcast i32 %683 to float
+  %688 = bitcast float %684 to i32
+  %689 = bitcast float %685 to i32
+  %690 = or i32 %688, %689
+  %691 = bitcast i32 %690 to float
+  %692 = bitcast float %686 to i32
+  %693 = bitcast float %687 to i32
+  %694 = or i32 %692, %693
+  %695 = bitcast i32 %694 to float
+  %696 = bitcast float %691 to i32
+  %697 = bitcast float %695 to i32
+  %698 = or i32 %696, %697
+  %699 = bitcast i32 %698 to float
+  %700 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %701 = extractelement <4 x float> %700, i32 0
+  %702 = fcmp une float 0xC018E2EB20000000, %701
+  %703 = select i1 %702, float 1.000000e+00, float 0.000000e+00
+  %704 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %705 = extractelement <4 x float> %704, i32 1
+  %706 = fcmp une float 0xBFEA8DB8C0000000, %705
+  %707 = select i1 %706, float 1.000000e+00, float 0.000000e+00
+  %708 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %709 = extractelement <4 x float> %708, i32 2
+  %710 = fcmp une float 0x4015236E20000000, %709
+  %711 = select i1 %710, float 1.000000e+00, float 0.000000e+00
+  %712 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %713 = extractelement <4 x float> %712, i32 3
+  %714 = fcmp une float 0x4016ED5D00000000, %713
+  %715 = select i1 %714, float 1.000000e+00, float 0.000000e+00
+  %716 = fsub float -0.000000e+00, %703
+  %717 = fptosi float %716 to i32
+  %718 = fsub float -0.000000e+00, %707
+  %719 = fptosi float %718 to i32
+  %720 = fsub float -0.000000e+00, %711
+  %721 = fptosi float %720 to i32
+  %722 = fsub float -0.000000e+00, %715
+  %723 = fptosi float %722 to i32
+  %724 = bitcast i32 %717 to float
+  %725 = bitcast i32 %719 to float
+  %726 = bitcast i32 %721 to float
+  %727 = bitcast i32 %723 to float
+  %728 = bitcast float %724 to i32
+  %729 = bitcast float %725 to i32
+  %730 = or i32 %728, %729
+  %731 = bitcast i32 %730 to float
+  %732 = bitcast float %726 to i32
+  %733 = bitcast float %727 to i32
+  %734 = or i32 %732, %733
+  %735 = bitcast i32 %734 to float
+  %736 = bitcast float %731 to i32
+  %737 = bitcast float %735 to i32
+  %738 = or i32 %736, %737
+  %739 = bitcast i32 %738 to float
+  %740 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %741 = extractelement <4 x float> %740, i32 0
+  %742 = fcmp une float 0x402332FEC0000000, %741
+  %743 = select i1 %742, float 1.000000e+00, float 0.000000e+00
+  %744 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %745 = extractelement <4 x float> %744, i32 1
+  %746 = fcmp une float 0xC01484B5E0000000, %745
+  %747 = select i1 %746, float 1.000000e+00, float 0.000000e+00
+  %748 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %749 = extractelement <4 x float> %748, i32 2
+  %750 = fcmp une float 0x400179A6C0000000, %749
+  %751 = select i1 %750, float 1.000000e+00, float 0.000000e+00
+  %752 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %753 = extractelement <4 x float> %752, i32 3
+  %754 = fcmp une float 0xBFEE752540000000, %753
+  %755 = select i1 %754, float 1.000000e+00, float 0.000000e+00
+  %756 = fsub float -0.000000e+00, %743
+  %757 = fptosi float %756 to i32
+  %758 = fsub float -0.000000e+00, %747
+  %759 = fptosi float %758 to i32
+  %760 = fsub float -0.000000e+00, %751
+  %761 = fptosi float %760 to i32
+  %762 = fsub float -0.000000e+00, %755
+  %763 = fptosi float %762 to i32
+  %764 = bitcast i32 %757 to float
+  %765 = bitcast i32 %759 to float
+  %766 = bitcast i32 %761 to float
+  %767 = bitcast i32 %763 to float
+  %768 = bitcast float %764 to i32
+  %769 = bitcast float %765 to i32
+  %770 = or i32 %768, %769
+  %771 = bitcast i32 %770 to float
+  %772 = bitcast float %766 to i32
+  %773 = bitcast float %767 to i32
+  %774 = or i32 %772, %773
+  %775 = bitcast i32 %774 to float
+  %776 = bitcast float %771 to i32
+  %777 = bitcast float %775 to i32
+  %778 = or i32 %776, %777
+  %779 = bitcast i32 %778 to float
+  %780 = insertelement <4 x float> undef, float %659, i32 0
+  %781 = insertelement <4 x float> %780, float %699, i32 1
+  %782 = insertelement <4 x float> %781, float %739, i32 2
+  %783 = insertelement <4 x float> %782, float %779, i32 3
+  %784 = insertelement <4 x float> undef, float %659, i32 0
+  %785 = insertelement <4 x float> %784, float %699, i32 1
+  %786 = insertelement <4 x float> %785, float %739, i32 2
+  %787 = insertelement <4 x float> %786, float %779, i32 3
+  %788 = call float @llvm.AMDGPU.dp4(<4 x float> %783, <4 x float> %787)
+  %789 = bitcast float %788 to i32
+  %790 = icmp ne i32 %789, 0
+  %791 = sext i1 %790 to i32
+  %792 = bitcast i32 %791 to float
+  %793 = bitcast float %792 to i32
+  %794 = xor i32 %793, -1
+  %795 = bitcast i32 %794 to float
+  %796 = bitcast float %91 to i32
+  %797 = bitcast float %179 to i32
+  %798 = and i32 %796, %797
+  %799 = bitcast i32 %798 to float
+  %800 = bitcast float %311 to i32
+  %801 = bitcast float %443 to i32
+  %802 = and i32 %800, %801
+  %803 = bitcast i32 %802 to float
+  %804 = bitcast float %799 to i32
+  %805 = bitcast float %803 to i32
+  %806 = and i32 %804, %805
+  %807 = bitcast i32 %806 to float
+  %808 = bitcast float %619 to i32
+  %809 = bitcast float %795 to i32
+  %810 = and i32 %808, %809
+  %811 = bitcast i32 %810 to float
+  %812 = bitcast float %807 to i32
+  %813 = bitcast float %811 to i32
+  %814 = and i32 %812, %813
+  %815 = bitcast i32 %814 to float
+  %816 = bitcast float %815 to i32
+  %817 = icmp ne i32 %816, 0
+  %. = select i1 %817, float 1.000000e+00, float 0.000000e+00
+  %.32 = select i1 %817, float 0.000000e+00, float 1.000000e+00
+  %818 = insertelement <4 x float> undef, float %0, i32 0
+  %819 = insertelement <4 x float> %818, float %1, i32 1
+  %820 = insertelement <4 x float> %819, float %2, i32 2
+  %821 = insertelement <4 x float> %820, float %3, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %821, i32 60, i32 1)
+  %822 = insertelement <4 x float> undef, float %.32, i32 0
+  %823 = insertelement <4 x float> %822, float %., i32 1
+  %824 = insertelement <4 x float> %823, float 0.000000e+00, i32 2
+  %825 = insertelement <4 x float> %824, float 1.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %825, i32 0, i32 2)
+  ret void
+}
+
+declare float @llvm.R600.load.input(i32) #1 ; i32 -> float; called from @main with indices 4..7; uses attribute group #1 (presumably reads a shader input by index -- confirm against the R600 backend)
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 ; two <4 x float> operands -> one float; uses attribute group #1 (presumably a 4-component dot product -- confirm)
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) ; returns void; takes a <4 x float> plus two i32 operands; no attribute group attached, so unlike the two declarations above it is not marked readnone
+
+attributes #0 = { "ShaderType"="1" } ; group #0: attached to @main; string attribute "ShaderType" set to "1" (meaning of the value is defined by the backend -- TODO confirm which shader stage it selects)
+attributes #1 = { readnone } ; group #1: shared by the llvm.R600.load.input and llvm.AMDGPU.dp4 declarations