mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-15 07:34:33 +00:00
Fix operand reorder logic in SLPVectorizer to generate longer vectorizable chains.
This patch fixes two issues in reorderInputsAccordingToOpcode: 1) AllSameOpcodeLeft and AllSameOpcodeRight were being calculated incorrectly, resulting in code not being vectorized in a few cases. 2) It adds logic to reorder operands if doing so yields a longer chain of consecutive loads, enabling vectorization. The same is handled for cases where we have an AltOpcode. Thanks to Michael for input and review. Review: http://reviews.llvm.org/D6677 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226547 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
01b2fd3f97
commit
7e9f120130
@ -268,104 +268,6 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Distribute the two operands of every binary operation in \p VL over
/// \p Left and \p Right, choosing a per-lane operand order that sorts the
/// operands by opcode while trying to keep broadcasts (splats) intact.
///
/// \param VL    The scalars of the bundle. Each one is cast to an Instruction
///              and its operands 0 and 1 are read.
/// \param Left  Receives one operand per lane, appended in lane order.
/// \param Right Receives the other operand of each lane.
///
/// \p Left and \p Right are indexed by lane number while they are being
/// filled, so they are expected to be empty on entry.
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                           SmallVectorImpl<Value *> &Left,
                                           SmallVectorImpl<Value *> &Right) {

  SmallVector<Value *, 16> OrigLeft, OrigRight;

  // These flags stay true only while *every* lane seen so far has an
  // instruction operand on that side whose opcode matches the previous lane.
  // They must accumulate over the loop: re-initialising them per iteration
  // would let the last pair of lanes alone decide the outcome.
  bool AllSameOpcodeLeft = true;
  bool AllSameOpcodeRight = true;
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    Instruction *I = cast<Instruction>(VL[i]);
    Value *V0 = I->getOperand(0);
    Value *V1 = I->getOperand(1);

    OrigLeft.push_back(V0);
    OrigRight.push_back(V1);

    Instruction *I0 = dyn_cast<Instruction>(V0);
    Instruction *I1 = dyn_cast<Instruction>(V1);

    // Check whether all operands on one side have the same opcode. In this
    // case we want to preserve the original order and not make things worse by
    // reordering.
    if (AllSameOpcodeLeft) {
      Instruction *P0 = i ? dyn_cast<Instruction>(OrigLeft[i - 1]) : nullptr;
      AllSameOpcodeLeft =
          I0 && (i == 0 || (P0 && P0->getOpcode() == I0->getOpcode()));
    }
    if (AllSameOpcodeRight) {
      Instruction *P1 = i ? dyn_cast<Instruction>(OrigRight[i - 1]) : nullptr;
      AllSameOpcodeRight =
          I1 && (i == 0 || (P1 && P1->getOpcode() == I1->getOpcode()));
    }

    // Sort two opcodes. In the code below we try to preserve the ability to
    // use broadcast of values instead of individual inserts.
    // vl1 = load
    // vl2 = phi
    // vr1 = load
    // vr2 = vr1
    //    = vl1 x vr1
    //    = vl2 x vr2
    // If we just sorted according to opcode we would leave the first line
    // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
    //    = vl1 x vr1
    //    = vr2 x vl2
    // Because vr2 and vr1 are from the same load we lose the opportunity of a
    // broadcast for the packed right side in the backend: we have [vr1, vl2]
    // instead of [vr1, vr2=vr1].
    bool Swap = false;
    if (I0 && I1) {
      const unsigned Op0 = I0->getOpcode();
      const unsigned Op1 = I1->getOpcode();
      if (Op0 > Op1) {
        // Sort by opcode, but try not to destroy a broadcast on the right for
        // no apparent benefit.
        Swap = (i == 0) || Right[i - 1] != I1;
      } else if (Op0 == Op1 && i != 0) {
        // Try to preserve broadcasts: swap when that repeats the value the
        // previous lane already placed on that side.
        Swap = Right[i - 1] == I0 || Left[i - 1] == I1;
      }
    } else if (I0) {
      // Only one operand is an instruction: put it on the right.
      Swap = true;
    }

    if (Swap) {
      Left.push_back(V1);
      Right.push_back(V0);
    } else {
      Left.push_back(V0);
      Right.push_back(V1);
    }
  }

  bool LeftBroadcast = isSplat(Left);
  bool RightBroadcast = isSplat(Right);

  // Don't reorder if the operands were good to begin with: unless the new
  // order produced a broadcast, fall back to the order the scalars had.
  if (!(LeftBroadcast || RightBroadcast) &&
      (AllSameOpcodeRight || AllSameOpcodeLeft)) {
    Left = OrigLeft;
    Right = OrigRight;
  }
}
|
|
||||||
|
|
||||||
/// \returns True if in-tree use also needs extract. This refers to
|
/// \returns True if in-tree use also needs extract. This refers to
|
||||||
/// possible scalar operand in vectorized instruction.
|
/// possible scalar operand in vectorized instruction.
|
||||||
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
|
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
|
||||||
@ -508,6 +410,16 @@ private:
|
|||||||
/// be beneficial even the tree height is tiny.
|
/// be beneficial even the tree height is tiny.
|
||||||
bool isFullyVectorizableTinyTree();
|
bool isFullyVectorizableTinyTree();
|
||||||
|
|
||||||
|
/// \brief Reorder commutative operands in an alt shuffle if that results in
|
||||||
|
/// vectorized code.
|
||||||
|
void reorderAltShuffleOperands(ArrayRef<Value *> VL,
|
||||||
|
SmallVectorImpl<Value *> &Left,
|
||||||
|
SmallVectorImpl<Value *> &Right);
|
||||||
|
/// \brief Reorder commutative operands to get a better probability of
|
||||||
|
/// generating vectorized code.
|
||||||
|
void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
|
||||||
|
SmallVectorImpl<Value *> &Left,
|
||||||
|
SmallVectorImpl<Value *> &Right);
|
||||||
struct TreeEntry {
|
struct TreeEntry {
|
||||||
TreeEntry() : Scalars(), VectorizedValue(nullptr),
|
TreeEntry() : Scalars(), VectorizedValue(nullptr),
|
||||||
NeedToGather(0) {}
|
NeedToGather(0) {}
|
||||||
@ -1441,6 +1353,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||||||
}
|
}
|
||||||
newTreeEntry(VL, true);
|
newTreeEntry(VL, true);
|
||||||
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
|
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
|
||||||
|
|
||||||
|
// Reorder operands if reordering would enable vectorization.
|
||||||
|
if (isa<BinaryOperator>(VL0)) {
|
||||||
|
ValueList Left, Right;
|
||||||
|
reorderAltShuffleOperands(VL, Left, Right);
|
||||||
|
buildTree_rec(Left, Depth + 1);
|
||||||
|
buildTree_rec(Right, Depth + 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
||||||
ValueList Operands;
|
ValueList Operands;
|
||||||
// Prepare the operand vector.
|
// Prepare the operand vector.
|
||||||
@ -1878,6 +1800,195 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
|
|||||||
return X == PtrSCEVB;
|
return X == PtrSCEVB;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reorder commutative operations in alternate shuffle if the resulting vectors
|
||||||
|
// are consecutive loads. This would allow us to vectorize the tree.
|
||||||
|
// If we have something like-
|
||||||
|
// load a[0] - load b[0]
|
||||||
|
// load b[1] + load a[1]
|
||||||
|
// load a[2] - load b[2]
|
||||||
|
// load a[3] + load b[3]
|
||||||
|
// Reordering the second load b[1] load a[1] would allow us to vectorize this
|
||||||
|
// code.
|
||||||
|
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
|
||||||
|
SmallVectorImpl<Value *> &Left,
|
||||||
|
SmallVectorImpl<Value *> &Right) {
|
||||||
|
|
||||||
|
// Push left and right operands of binary operation into Left and Right
|
||||||
|
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
|
||||||
|
Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
|
||||||
|
Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reorder if we have a commutative operation and consecutive access
|
||||||
|
// are on either side of the alternate instructions.
|
||||||
|
for (unsigned j = 0; j < VL.size() - 1; ++j) {
|
||||||
|
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
|
||||||
|
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
|
||||||
|
Instruction *VL1 = cast<Instruction>(VL[j]);
|
||||||
|
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
|
||||||
|
if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
|
||||||
|
std::swap(Left[j], Right[j]);
|
||||||
|
continue;
|
||||||
|
} else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
|
||||||
|
std::swap(Left[j + 1], Right[j + 1]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// else unchanged
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
|
||||||
|
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
|
||||||
|
Instruction *VL1 = cast<Instruction>(VL[j]);
|
||||||
|
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
|
||||||
|
if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
|
||||||
|
std::swap(Left[j], Right[j]);
|
||||||
|
continue;
|
||||||
|
} else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
|
||||||
|
std::swap(Left[j + 1], Right[j + 1]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// else unchanged
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Distribute the two operands of every binary operation in VL over Left and
// Right. The per-lane order is chosen by sorting operands by opcode while
// trying to keep broadcasts (splats) intact; if that yields no broadcast and
// one side already had a uniform opcode, the original order is restored.
// Finally, operands of adjacent lanes are swapped where that lines up
// consecutive loads on the same side.
//
// VL holds the scalars of the bundle; each is cast to an Instruction and its
// operands 0 and 1 are read. Left and Right are indexed by lane number while
// they are filled, so they are expected to be empty on entry.
//
// NOTE(review): operands are exchanged without checking commutativity here;
// presumably callers only pass commutative operations - verify at call sites.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                             SmallVectorImpl<Value *> &Left,
                                             SmallVectorImpl<Value *> &Right) {
  const unsigned NumLanes = VL.size();
  SmallVector<Value *, 16> OrigLeft, OrigRight;

  // True if Prev is an instruction with the same opcode as Cur.
  auto SameOpcodeAsPrev = [](Value *Prev, Instruction *Cur) -> bool {
    Instruction *P = dyn_cast<Instruction>(Prev);
    return P && P->getOpcode() == Cur->getOpcode();
  };

  bool AllSameOpcodeLeft = true;
  bool AllSameOpcodeRight = true;
  for (unsigned i = 0; i != NumLanes; ++i) {
    Instruction *I = cast<Instruction>(VL[i]);
    Value *VLeft = I->getOperand(0);
    Value *VRight = I->getOperand(1);

    OrigLeft.push_back(VLeft);
    OrigRight.push_back(VRight);

    Instruction *ILeft = dyn_cast<Instruction>(VLeft);
    Instruction *IRight = dyn_cast<Instruction>(VRight);

    // Check whether all operands on one side have the same opcode. In this
    // case we want to preserve the original order and not make things worse by
    // reordering.
    // NOTE(review): an operand that is not an Instruction is skipped here and
    // therefore never clears the flag - confirm this is intended.
    if (i && AllSameOpcodeLeft && ILeft)
      AllSameOpcodeLeft = SameOpcodeAsPrev(OrigLeft[i - 1], ILeft);
    if (i && AllSameOpcodeRight && IRight)
      AllSameOpcodeRight = SameOpcodeAsPrev(OrigRight[i - 1], IRight);

    // Sort two opcodes. In the code below we try to preserve the ability to
    // use broadcast of values instead of individual inserts.
    // vl1 = load
    // vl2 = phi
    // vr1 = load
    // vr2 = vr1
    //    = vl1 x vr1
    //    = vl2 x vr2
    // If we just sorted according to opcode we would leave the first line
    // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
    //    = vl1 x vr1
    //    = vr2 x vl2
    // Because vr2 and vr1 are from the same load we lose the opportunity of a
    // broadcast for the packed right side in the backend: we have [vr1, vl2]
    // instead of [vr1, vr2=vr1].
    bool Swap = false;
    if (ILeft && IRight) {
      const unsigned OpLeft = ILeft->getOpcode();
      const unsigned OpRight = IRight->getOpcode();
      if (OpLeft > OpRight) {
        // Sort by opcode, but try not to destroy a broadcast on the right for
        // no apparent benefit.
        Swap = (i == 0) || Right[i - 1] != IRight;
      } else if (OpLeft == OpRight && i != 0) {
        // Try to preserve broadcasts: swap when that repeats the value the
        // previous lane already placed on that side.
        Swap = Right[i - 1] == ILeft || Left[i - 1] == IRight;
      }
    } else if (ILeft) {
      // One opcode, put the instruction on the right.
      Swap = true;
    }

    if (Swap) {
      Left.push_back(VRight);
      Right.push_back(VLeft);
    } else {
      Left.push_back(VLeft);
      Right.push_back(VRight);
    }
  }

  bool LeftBroadcast = isSplat(Left);
  bool RightBroadcast = isSplat(Right);

  // If operands end up being broadcast return this operand order.
  if (LeftBroadcast || RightBroadcast)
    return;

  // Don't reorder if the operands were good to begin with.
  if (AllSameOpcodeRight || AllSameOpcodeLeft) {
    Left = OrigLeft;
    Right = OrigRight;
  }

  // Finally check if we can get longer vectorizable chain by reordering
  // without breaking the good operand order detected above.
  // E.g. If we have something like-
  // load a[0]  load b[0]
  // load b[1]  load a[1]
  // load a[2]  load b[2]
  // load a[3]  load b[3]
  // Reordering the second load b[1] load a[1] would allow us to vectorize
  // this code and we still retain AllSameOpcode property.
  // FIXME: This load reordering might break AllSameOpcode in some rare cases
  // such as-
  // add a[0],c[0]  load b[0]
  // add a[1],c[2]  load b[1]
  // b[2]           load b[2]
  // add a[3],c[3]  load b[3]
  auto IsConsecutiveLoadPair = [&](Value *Cur, Value *Next) -> bool {
    LoadInst *L = dyn_cast<LoadInst>(Cur);
    if (!L)
      return false;
    LoadInst *L1 = dyn_cast<LoadInst>(Next);
    return L1 && isConsecutiveAccess(L, L1);
  };

  // The bound is written as `j + 1 < NumLanes` so that an empty bundle does
  // not wrap around in unsigned arithmetic.
  for (unsigned j = 0; j + 1 < NumLanes; ++j) {
    // If a load in lane j is followed in memory by a load on the *other* side
    // of lane j + 1, swap lane j + 1 so both end up on the same side.
    if (IsConsecutiveLoadPair(Left[j], Right[j + 1]) ||
        IsConsecutiveLoadPair(Right[j], Left[j + 1]))
      std::swap(Left[j + 1], Right[j + 1]);
    // else unchanged
  }
}
|
||||||
|
|
||||||
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
|
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
|
||||||
Instruction *VL0 = cast<Instruction>(VL[0]);
|
Instruction *VL0 = cast<Instruction>(VL[0]);
|
||||||
BasicBlock::iterator NextInst = VL0;
|
BasicBlock::iterator NextInst = VL0;
|
||||||
@ -2274,10 +2385,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||||||
}
|
}
|
||||||
case Instruction::ShuffleVector: {
|
case Instruction::ShuffleVector: {
|
||||||
ValueList LHSVL, RHSVL;
|
ValueList LHSVL, RHSVL;
|
||||||
for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
|
assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
|
||||||
LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
|
reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
|
||||||
RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
|
|
||||||
}
|
|
||||||
setInsertPointAfterBundle(E->Scalars);
|
setInsertPointAfterBundle(E->Scalars);
|
||||||
|
|
||||||
Value *LHS = vectorizeTree(LHSVL);
|
Value *LHS = vectorizeTree(LHSVL);
|
||||||
|
@ -10,6 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
|
|||||||
@fb = common global [4 x float] zeroinitializer, align 16
|
@fb = common global [4 x float] zeroinitializer, align 16
|
||||||
@fc = common global [4 x float] zeroinitializer, align 16
|
@fc = common global [4 x float] zeroinitializer, align 16
|
||||||
@fa = common global [4 x float] zeroinitializer, align 16
|
@fa = common global [4 x float] zeroinitializer, align 16
|
||||||
|
@fd = common global [4 x float] zeroinitializer, align 16
|
||||||
|
|
||||||
; CHECK-LABEL: @addsub
|
; CHECK-LABEL: @addsub
|
||||||
; CHECK: %5 = add nsw <4 x i32> %3, %4
|
; CHECK: %5 = add nsw <4 x i32> %3, %4
|
||||||
@ -177,5 +178,137 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; Check vectorization of following code for float data type-
|
||||||
|
; fc[0] = fb[0]+fa[0]; //swapped fb and fa
|
||||||
|
; fc[1] = fa[1]-fb[1];
|
||||||
|
; fc[2] = fa[2]+fb[2];
|
||||||
|
; fc[3] = fa[3]-fb[3];
|
||||||
|
|
||||||
|
; CHECK-LABEL: @reorder_alt
|
||||||
|
; CHECK: %3 = fadd <4 x float> %1, %2
|
||||||
|
; CHECK: %4 = fsub <4 x float> %1, %2
|
||||||
|
; CHECK: %5 = shufflevector <4 x float> %3, <4 x float> %4, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
define void @reorder_alt() #0 {
|
||||||
|
%1 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
|
||||||
|
%2 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
|
||||||
|
%3 = fadd float %1, %2
|
||||||
|
store float %3, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
|
||||||
|
%4 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
|
||||||
|
%5 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
|
||||||
|
%6 = fsub float %4, %5
|
||||||
|
store float %6, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
|
||||||
|
%7 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
|
||||||
|
%8 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
|
||||||
|
%9 = fadd float %7, %8
|
||||||
|
store float %9, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
|
||||||
|
%10 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
|
||||||
|
%11 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
|
||||||
|
%12 = fsub float %10, %11
|
||||||
|
store float %12, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Check vectorization of following code for float data type-
|
||||||
|
; fc[0] = fa[0]+(fb[0]-fd[0]);
|
||||||
|
; fc[1] = fa[1]-(fb[1]+fd[1]);
|
||||||
|
; fc[2] = fa[2]+(fb[2]-fd[2]);
|
||||||
|
; fc[3] = fa[3]-(fd[3]+fb[3]); //swapped fd and fb
|
||||||
|
|
||||||
|
; CHECK-LABEL: @reorder_alt_subTree
|
||||||
|
; CHECK: %4 = fsub <4 x float> %3, %2
|
||||||
|
; CHECK: %5 = fadd <4 x float> %3, %2
|
||||||
|
; CHECK: %6 = shufflevector <4 x float> %4, <4 x float> %5, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
; CHECK: %7 = fadd <4 x float> %1, %6
|
||||||
|
; CHECK: %8 = fsub <4 x float> %1, %6
|
||||||
|
; CHECK: %9 = shufflevector <4 x float> %7, <4 x float> %8, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
define void @reorder_alt_subTree() #0 {
|
||||||
|
%1 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
|
||||||
|
%2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
|
||||||
|
%3 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 0), align 4
|
||||||
|
%4 = fsub float %2, %3
|
||||||
|
%5 = fadd float %1, %4
|
||||||
|
store float %5, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
|
||||||
|
%6 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
|
||||||
|
%7 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
|
||||||
|
%8 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 1), align 4
|
||||||
|
%9 = fadd float %7, %8
|
||||||
|
%10 = fsub float %6, %9
|
||||||
|
store float %10, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
|
||||||
|
%11 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
|
||||||
|
%12 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
|
||||||
|
%13 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 2), align 4
|
||||||
|
%14 = fsub float %12, %13
|
||||||
|
%15 = fadd float %11, %14
|
||||||
|
store float %15, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
|
||||||
|
%16 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
|
||||||
|
%17 = load float* getelementptr inbounds ([4 x float]* @fd, i32 0, i64 3), align 4
|
||||||
|
%18 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
|
||||||
|
%19 = fadd float %17, %18
|
||||||
|
%20 = fsub float %16, %19
|
||||||
|
store float %20, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Check vectorization of following code for double data type-
|
||||||
|
; c[0] = (a[0]+b[0])-d[0];
|
||||||
|
; c[1] = d[1]+(a[1]+b[1]); //swapped d[1] and (a[1]+b[1])
|
||||||
|
|
||||||
|
; CHECK-LABEL: @reorder_alt_rightsubTree
|
||||||
|
; CHECK: fadd <2 x double>
|
||||||
|
; CHECK: fsub <2 x double>
|
||||||
|
; CHECK: shufflevector <2 x double>
|
||||||
|
define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) {
|
||||||
|
%1 = load double* %a
|
||||||
|
%2 = load double* %b
|
||||||
|
%3 = fadd double %1, %2
|
||||||
|
%4 = load double* %d
|
||||||
|
%5 = fsub double %3, %4
|
||||||
|
store double %5, double* %c
|
||||||
|
%6 = getelementptr inbounds double* %d, i64 1
|
||||||
|
%7 = load double* %6
|
||||||
|
%8 = getelementptr inbounds double* %a, i64 1
|
||||||
|
%9 = load double* %8
|
||||||
|
%10 = getelementptr inbounds double* %b, i64 1
|
||||||
|
%11 = load double* %10
|
||||||
|
%12 = fadd double %9, %11
|
||||||
|
%13 = fadd double %7, %12
|
||||||
|
%14 = getelementptr inbounds double* %c, i64 1
|
||||||
|
store double %13, double* %14
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Don't vectorize the following code for float data type, as fsub is not commutative-
|
||||||
|
; fc[0] = fb[0]+fa[0];
|
||||||
|
; fc[1] = fa[1]-fb[1];
|
||||||
|
; fc[2] = fa[2]+fb[2];
|
||||||
|
; fc[3] = fb[3]-fa[3];
|
||||||
|
; In the above code we can swap the operands of the 1st and 3rd operations as fadd is commutative
|
||||||
|
; but not 2nd or 4th as fsub is not commutative.
|
||||||
|
|
||||||
|
; CHECK-LABEL: @no_vec_shuff_reorder
|
||||||
|
; CHECK-NOT: fadd <4 x float>
|
||||||
|
; CHECK-NOT: fsub <4 x float>
|
||||||
|
; CHECK-NOT: shufflevector
|
||||||
|
define void @no_vec_shuff_reorder() #0 {
|
||||||
|
%1 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
|
||||||
|
%2 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
|
||||||
|
%3 = fadd float %1, %2
|
||||||
|
store float %3, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
|
||||||
|
%4 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
|
||||||
|
%5 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
|
||||||
|
%6 = fsub float %4, %5
|
||||||
|
store float %6, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
|
||||||
|
%7 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
|
||||||
|
%8 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
|
||||||
|
%9 = fadd float %7, %8
|
||||||
|
store float %9, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
|
||||||
|
%10 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
|
||||||
|
%11 = load float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
|
||||||
|
%12 = fsub float %10, %11
|
||||||
|
store float %12, float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
attributes #0 = { nounwind }
|
attributes #0 = { nounwind }
|
||||||
|
|
||||||
|
@ -232,3 +232,113 @@ for.body3:
|
|||||||
for.end:
|
for.end:
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; Check vectorization of following code for double data type-
|
||||||
|
; c[0] = a[0]+b[0];
|
||||||
|
; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
|
||||||
|
|
||||||
|
; CHECK-LABEL: load_reorder_double
|
||||||
|
; CHECK: load <2 x double>*
|
||||||
|
; CHECK: fadd <2 x double>
|
||||||
|
define void @load_reorder_double(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b){
|
||||||
|
%1 = load double* %a
|
||||||
|
%2 = load double* %b
|
||||||
|
%3 = fadd double %1, %2
|
||||||
|
store double %3, double* %c
|
||||||
|
%4 = getelementptr inbounds double* %b, i64 1
|
||||||
|
%5 = load double* %4
|
||||||
|
%6 = getelementptr inbounds double* %a, i64 1
|
||||||
|
%7 = load double* %6
|
||||||
|
%8 = fadd double %5, %7
|
||||||
|
%9 = getelementptr inbounds double* %c, i64 1
|
||||||
|
store double %8, double* %9
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Check vectorization of following code for float data type-
|
||||||
|
; c[0] = a[0]+b[0];
|
||||||
|
; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
|
||||||
|
; c[2] = a[2]+b[2];
|
||||||
|
; c[3] = a[3]+b[3];
|
||||||
|
|
||||||
|
; CHECK-LABEL: load_reorder_float
|
||||||
|
; CHECK: load <4 x float>*
|
||||||
|
; CHECK: fadd <4 x float>
|
||||||
|
define void @load_reorder_float(float* nocapture %c, float* noalias nocapture readonly %a, float* noalias nocapture readonly %b){
|
||||||
|
%1 = load float* %a
|
||||||
|
%2 = load float* %b
|
||||||
|
%3 = fadd float %1, %2
|
||||||
|
store float %3, float* %c
|
||||||
|
%4 = getelementptr inbounds float* %b, i64 1
|
||||||
|
%5 = load float* %4
|
||||||
|
%6 = getelementptr inbounds float* %a, i64 1
|
||||||
|
%7 = load float* %6
|
||||||
|
%8 = fadd float %5, %7
|
||||||
|
%9 = getelementptr inbounds float* %c, i64 1
|
||||||
|
store float %8, float* %9
|
||||||
|
%10 = getelementptr inbounds float* %a, i64 2
|
||||||
|
%11 = load float* %10
|
||||||
|
%12 = getelementptr inbounds float* %b, i64 2
|
||||||
|
%13 = load float* %12
|
||||||
|
%14 = fadd float %11, %13
|
||||||
|
%15 = getelementptr inbounds float* %c, i64 2
|
||||||
|
store float %14, float* %15
|
||||||
|
%16 = getelementptr inbounds float* %a, i64 3
|
||||||
|
%17 = load float* %16
|
||||||
|
%18 = getelementptr inbounds float* %b, i64 3
|
||||||
|
%19 = load float* %18
|
||||||
|
%20 = fadd float %17, %19
|
||||||
|
%21 = getelementptr inbounds float* %c, i64 3
|
||||||
|
store float %20, float* %21
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Check we properly reorder the below code so that it gets vectorized optimally-
|
||||||
|
; a[0] = (b[0]+c[0])+d[0];
|
||||||
|
; a[1] = d[1]+(b[1]+c[1]);
|
||||||
|
; a[2] = (b[2]+c[2])+d[2];
|
||||||
|
; a[3] = (b[3]+c[3])+d[3];
|
||||||
|
|
||||||
|
; CHECK-LABEL: opcode_reorder
|
||||||
|
; CHECK: load <4 x float>*
|
||||||
|
; CHECK: fadd <4 x float>
|
||||||
|
define void @opcode_reorder(float* noalias nocapture %a, float* noalias nocapture readonly %b,
|
||||||
|
float* noalias nocapture readonly %c,float* noalias nocapture readonly %d){
|
||||||
|
%1 = load float* %b
|
||||||
|
%2 = load float* %c
|
||||||
|
%3 = fadd float %1, %2
|
||||||
|
%4 = load float* %d
|
||||||
|
%5 = fadd float %3, %4
|
||||||
|
store float %5, float* %a
|
||||||
|
%6 = getelementptr inbounds float* %d, i64 1
|
||||||
|
%7 = load float* %6
|
||||||
|
%8 = getelementptr inbounds float* %b, i64 1
|
||||||
|
%9 = load float* %8
|
||||||
|
%10 = getelementptr inbounds float* %c, i64 1
|
||||||
|
%11 = load float* %10
|
||||||
|
%12 = fadd float %9, %11
|
||||||
|
%13 = fadd float %7, %12
|
||||||
|
%14 = getelementptr inbounds float* %a, i64 1
|
||||||
|
store float %13, float* %14
|
||||||
|
%15 = getelementptr inbounds float* %b, i64 2
|
||||||
|
%16 = load float* %15
|
||||||
|
%17 = getelementptr inbounds float* %c, i64 2
|
||||||
|
%18 = load float* %17
|
||||||
|
%19 = fadd float %16, %18
|
||||||
|
%20 = getelementptr inbounds float* %d, i64 2
|
||||||
|
%21 = load float* %20
|
||||||
|
%22 = fadd float %19, %21
|
||||||
|
%23 = getelementptr inbounds float* %a, i64 2
|
||||||
|
store float %22, float* %23
|
||||||
|
%24 = getelementptr inbounds float* %b, i64 3
|
||||||
|
%25 = load float* %24
|
||||||
|
%26 = getelementptr inbounds float* %c, i64 3
|
||||||
|
%27 = load float* %26
|
||||||
|
%28 = fadd float %25, %27
|
||||||
|
%29 = getelementptr inbounds float* %d, i64 3
|
||||||
|
%30 = load float* %29
|
||||||
|
%31 = fadd float %28, %30
|
||||||
|
%32 = getelementptr inbounds float* %a, i64 3
|
||||||
|
store float %31, float* %32
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user