Preserve IR flags (nsw, nuw, exact, fast-math) in SLP vectorizer (PR20802).

The SLP vectorizer should propagate IR-level optimization hints/flags
(nsw, nuw, exact, fast-math) when converting scalar instructions into
vectors. But this isn't a simple copy - we need to take the intersection
(the logical 'and') of the sets of flags on the scalars.

The solution is further complicated because we can have non-uniform
(non-SIMD) vector ops after:
http://reviews.llvm.org/D4015
http://llvm.org/viewvc/llvm-project?view=revision&revision=211339

The vast majority of changed files are existing tests that were not
propagating IR flags, but I've also added a new test file for focused
testing of IR flag possibilities.

Differential Revision: http://reviews.llvm.org/D5172

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217051 91177308-0d34-0410-b5e6-96231b3b80d8
parent 6d66a1cd2f
commit b89304eb7c
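To make the intersection rule concrete, here is a small IR sketch (illustrative only, not part of the commit; the value names are invented). Because one scalar in the bundle lacks 'nsw', the vectorized replacement must drop it:

; Three of the four scalar adds carry 'nsw'; the fourth does not.
%op1 = add nsw i32 %a, 1
%op2 = add nsw i32 %b, 1
%op3 = add nsw i32 %c, 1
%op4 = add i32 %d, 1
; The intersection of the four flag sets is empty, so the
; vector replacement is a plain add:
%vadd = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>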
@@ -166,6 +166,23 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
   return Opcode;
 }
 
+/// Get the intersection (logical and) of all of the potential IR flags
+/// of each scalar operation (VL) that will be converted into a vector (I).
+/// Flag set: NSW, NUW, exact, and all of fast-math.
+static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
+    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
+      // Intersection is initialized to the 0th scalar,
+      // so start counting from index '1'.
+      for (int i = 1, e = VL.size(); i < e; ++i) {
+        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+          Intersection->andIRFlags(Scalar);
+      }
+      VecOp->copyIRFlags(Intersection);
+    }
+  }
+}
+
 /// \returns \p I after propagating metadata from \p VL.
 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
   Instruction *I0 = cast<Instruction>(VL[0]);
@@ -2031,6 +2048,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
     Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
     E->VectorizedValue = V;
+    propagateIRFlags(E->VectorizedValue, E->Scalars);
     ++NumVectorInstructions;
 
     if (Instruction *I = dyn_cast<Instruction>(V))
@@ -2194,18 +2212,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
     Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
 
-    // Create appropriate shuffle to take alternative operations from
-    // the vector.
-    std::vector<Constant *> Mask(E->Scalars.size());
+    // Create shuffle to take alternate operations from the vector.
+    // Also, gather up odd and even scalar ops to propagate IR flags to
+    // each vector operation.
+    ValueList OddScalars, EvenScalars;
     unsigned e = E->Scalars.size();
+    SmallVector<Constant *, 8> Mask(e);
     for (unsigned i = 0; i < e; ++i) {
-      if (i & 1)
+      if (i & 1) {
         Mask[i] = Builder.getInt32(e + i);
-      else
+        OddScalars.push_back(E->Scalars[i]);
+      } else {
         Mask[i] = Builder.getInt32(i);
+        EvenScalars.push_back(E->Scalars[i]);
+      }
     }
 
     Value *ShuffleMask = ConstantVector::get(Mask);
+    propagateIRFlags(V0, EvenScalars);
+    propagateIRFlags(V1, OddScalars);
 
     Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
     E->VectorizedValue = V;
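In this alternating-opcode path, the even-index scalars feed V0 and the odd-index scalars feed V1, so each vector op receives the intersection of only its own scalars' flags. A schematic sketch of the effect (illustrative only; the names are invented, but the pattern matches the @addsub tests in the new test file below):

; Even lanes are 'add nsw', odd lanes are plain 'sub':
%op1 = add nsw i32 %a, 1
%op2 = sub i32 %b, 1
%op3 = add nsw i32 %c, 1
%op4 = sub i32 %d, 1
; V0 keeps 'nsw' (all even scalars have it); V1 gets no flags:
%v0 = add nsw <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
%v1 = sub <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
%r = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>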
@@ -10,8 +10,8 @@ define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK: %arrayidx9 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 1
 ; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>*
 ; CHECK: %4 = load <2 x float>* %3, align 4
-; CHECK: %5 = fsub <2 x float> %2, %4
-; CHECK: %6 = fmul <2 x float> %5, %5
+; CHECK: %5 = fsub fast <2 x float> %2, %4
+; CHECK: %6 = fmul fast <2 x float> %5, %5
 ; CHECK: %7 = extractelement <2 x float> %6, i32 0
 ; CHECK: %8 = extractelement <2 x float> %6, i32 1
 ; CHECK: %add = fadd fast float %7, %8
@@ -45,8 +45,8 @@ define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK: %arrayidx9 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 1
 ; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>*
 ; CHECK: %4 = load <2 x float>* %3, align 4
-; CHECK: %5 = fsub <2 x float> %2, %4
-; CHECK: %6 = fmul <2 x float> %5, %5
+; CHECK: %5 = fsub fast <2 x float> %2, %4
+; CHECK: %6 = fmul fast <2 x float> %5, %5
 ; CHECK: %7 = extractelement <2 x float> %6, i32 0
 ; CHECK: %8 = extractelement <2 x float> %6, i32 1
 ; CHECK: %add = fadd fast float %8, %7
@@ -5,11 +5,11 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 %class.Complex = type { double, double }
 
 ; Code like this is the result of SROA. Make sure we don't vectorize this
-; because the in the scalar version of this the shl/or are handled by the
+; because the scalar version of the shl/or are handled by the
 ; backend and disappear, the vectorized code stays.
 
 ; CHECK-LABEL: SROAed
-; CHECK-NOT: shl <2 x i64>
+; CHECK-NOT: shl nuw <2 x i64>
 ; CHECK-NOT: or <2 x i64>
 
 define void @SROAed(%class.Complex* noalias nocapture sret %agg.result, [4 x i32] %a.coerce, [4 x i32] %b.coerce) {
@@ -12,9 +12,9 @@ target triple = "x86_64-unknown-linux-gnu"
 @fa = common global [4 x float] zeroinitializer, align 16
 
 ; CHECK-LABEL: @addsub
-; CHECK: %5 = add <4 x i32> %3, %4
-; CHECK: %6 = add <4 x i32> %2, %5
-; CHECK: %7 = sub <4 x i32> %2, %5
+; CHECK: %5 = add nsw <4 x i32> %3, %4
+; CHECK: %6 = add nsw <4 x i32> %2, %5
+; CHECK: %7 = sub nsw <4 x i32> %2, %5
 ; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 
 ; Function Attrs: nounwind uwtable
@@ -56,9 +56,9 @@ entry:
 }
 
 ; CHECK-LABEL: @subadd
-; CHECK: %5 = add <4 x i32> %3, %4
-; CHECK: %6 = sub <4 x i32> %2, %5
-; CHECK: %7 = add <4 x i32> %2, %5
+; CHECK: %5 = add nsw <4 x i32> %3, %4
+; CHECK: %6 = sub nsw <4 x i32> %2, %5
+; CHECK: %7 = add nsw <4 x i32> %2, %5
 ; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 
 ; Function Attrs: nounwind uwtable
@@ -15,7 +15,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ;CHECK: bitcast i32* %A to <4 x i32>*
 ;CHECK-NEXT: load <4 x i32>
 ;CHECK: phi <4 x i32>
-;CHECK-NEXT: mul <4 x i32>
+;CHECK-NEXT: mul nsw <4 x i32>
 ;CHECK-NOT: mul
 ;CHECK: phi <4 x i32>
 ;CHECK: bitcast i32* %A to <4 x i32>*
@@ -21,7 +21,7 @@ target triple = "i386-apple-macosx10.9.0"
 ; loop body:
 ;CHECK: phi
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
@@ -148,7 +148,7 @@ for.end:
 ; }
 
 ; CHECK-LABEL: long_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
@@ -250,7 +250,7 @@ for.end:
 ; }
 
 ; CHECK-LABEL: chain_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
@@ -317,7 +317,7 @@ for.end:
 ; }
 
 ; CHECK-LABEL: store_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
@@ -379,7 +379,7 @@ for.end:
 ; }
 
 ; STORE-LABEL: store_red_double
-; STORE: fmul <2 x double>
+; STORE: fmul fast <2 x double>
 ; STORE: extractelement <2 x double>
 ; STORE: extractelement <2 x double>
 
@@ -5,10 +5,10 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ;CHECK-LABEL: @foo(
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n) #0 {
@@ -14,7 +14,7 @@ target triple = "x86_64-apple-macosx10.7.0"
 ;CHECK-LABEL: @foo(
 ;CHECK: insertelement <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n) {
@@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ;CHECK-LABEL: @powof2div(
 ;CHECK: load <4 x i32>*
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: sdiv <4 x i32>
 define void @powof2div(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
 entry:
test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll (new file, 350 lines)
@@ -0,0 +1,350 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+; Check propagation of optional IR flags (PR20802). For a flag to
+; propagate from scalar instructions to their vector replacement,
+; *all* scalar instructions must have the flag.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; CHECK-LABEL: @exact(
+; CHECK: lshr exact <4 x i32>
+define void @exact(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = lshr exact i32 %load1, 1
+  %op2 = lshr exact i32 %load2, 1
+  %op3 = lshr exact i32 %load3, 1
+  %op4 = lshr exact i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_exact(
+; CHECK: lshr <4 x i32>
+define void @not_exact(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = lshr exact i32 %load1, 1
+  %op2 = lshr i32 %load2, 1
+  %op3 = lshr exact i32 %load3, 1
+  %op4 = lshr exact i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @nsw(
+; CHECK: add nsw <4 x i32>
+define void @nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = add nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = add nsw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_nsw(
+; CHECK: add <4 x i32>
+define void @not_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = add nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = add i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @nuw(
+; CHECK: add nuw <4 x i32>
+define void @nuw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nuw i32 %load1, 1
+  %op2 = add nuw i32 %load2, 1
+  %op3 = add nuw i32 %load3, 1
+  %op4 = add nuw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_nuw(
+; CHECK: add <4 x i32>
+define void @not_nuw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nuw i32 %load1, 1
+  %op2 = add i32 %load2, 1
+  %op3 = add i32 %load3, 1
+  %op4 = add nuw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @nnan(
+; CHECK: fadd nnan <4 x float>
+define void @nnan(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast nnan float %load1, 1.0
+  %op2 = fadd nnan ninf float %load2, 1.0
+  %op3 = fadd nsz nnan float %load3, 1.0
+  %op4 = fadd arcp nnan float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_nnan(
+; CHECK: fadd <4 x float>
+define void @not_nnan(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd nnan float %load1, 1.0
+  %op2 = fadd ninf float %load2, 1.0
+  %op3 = fadd nsz float %load3, 1.0
+  %op4 = fadd arcp float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @only_fast(
+; CHECK: fadd fast <4 x float>
+define void @only_fast(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast nnan float %load1, 1.0
+  %op2 = fadd fast nnan ninf float %load2, 1.0
+  %op3 = fadd fast nsz nnan float %load3, 1.0
+  %op4 = fadd arcp nnan fast float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @only_arcp(
+; CHECK: fadd arcp <4 x float>
+define void @only_arcp(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast float %load1, 1.0
+  %op2 = fadd fast float %load2, 1.0
+  %op3 = fadd fast float %load3, 1.0
+  %op4 = fadd arcp float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @addsub_all_nsw
+; CHECK: add nsw <4 x i32>
+; CHECK: sub nsw <4 x i32>
+define void @addsub_all_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub nsw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @addsub_some_nsw
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+define void @addsub_some_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @addsub_no_nsw
+; CHECK: add <4 x i32>
+; CHECK: sub <4 x i32>
+define void @addsub_no_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
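One subtlety exercised by @only_fast and @only_arcp above: 'fast' subsumes the individual fast-math flags, so intersecting a 'fast' scalar with an 'arcp' scalar leaves just 'arcp'. A schematic sketch (illustrative only, not part of the test file):

; 'fast' sets all of the fast-math bits (nnan, ninf, nsz, arcp, ...),
; so the intersection of {fast} and {arcp} is {arcp}:
%s1 = fadd fast float %a, 1.0
%s2 = fadd arcp float %b, 1.0
; after vectorizing such a bundle: fadd arcp <4 x float> ...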
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ; SLP vectorization example from http://cs.stanford.edu/people/eschkufz/research/asplos291-schkufza.pdf
 ;CHECK: SAXPY
-;CHECK: mul <4 x i32>
+;CHECK: mul nsw <4 x i32>
 ;CHECK: ret
 
 define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a, i64 %i) {
@@ -6,7 +6,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ;CHECK-LABEL: @foo
 ;CHECK: load <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: %[[S1:.+]] = add <4 x i32>
+;CHECK: %[[S1:.+]] = add nsw <4 x i32>
 ;CHECK-DAG: store <4 x i32> %[[S1]]
 ;CHECK-DAG: %[[A1:.+]] = add nsw i32
 ;CHECK-DAG: %[[A2:.+]] = add nsw i32 %[[A1]]