mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-14 16:33:28 +00:00
add a new SimplifyDemandedVectorElts method, which works similarly to
SimplifyDemandedBits. The idea is that some operations can be simplified if not all of the computed elements are needed. Some targets (like x86) have a large number of intrinsics that operate on a single element, but pass other elts through unmodified. If those other elements are not needed, the intrinsics can be simplified to scalar operations, and insertelement ops can be removed. This turns (f.e.): ushort %Convert_sse(float %f) { %tmp = insertelement <4 x float> undef, float %f, uint 0 ; <<4 x float>> [#uses=1] %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, uint 1 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, uint 2 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, uint 3 ; <<4 x float>> [#uses=1] %tmp28 = tail call <4 x float> %llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1] %tmp37 = tail call <4 x float> %llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1] %tmp48 = tail call <4 x float> %llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1] %tmp59 = tail call <4 x float> %llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1] %tmp = tail call int %llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <int> [#uses=1] %tmp69 = cast int %tmp to ushort ; <ushort> [#uses=1] ret ushort %tmp69 } into: ushort %Convert_sse(float %f) { entry: %tmp28 = sub float %f, 1.000000e+00 ; <float> [#uses=1] %tmp37 = mul float %tmp28, 5.000000e-01 ; <float> [#uses=1] %tmp375 = insertelement <4 x float> undef, float %tmp37, uint 0 ; <<4 x float>> [#uses=1] %tmp48 = tail call <4 x float> %llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1] %tmp59 = tail call <4 x float> %llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1] %tmp = tail call int %llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <int> [#uses=1] %tmp69 = cast int %tmp to ushort ; <ushort> [#uses=1] ret ushort %tmp69 } which improves codegen from: _Convert_sse: movss LCPI1_0, %xmm0 movss 4(%esp), %xmm1 subss %xmm0, %xmm1 movss LCPI1_1, %xmm0 mulss %xmm0, %xmm1 movss LCPI1_2, %xmm0 minss %xmm0, %xmm1 xorps %xmm0, %xmm0 maxss %xmm0, %xmm1 cvttss2si %xmm1, %eax andl $65535, %eax ret to: _Convert_sse: movss 4(%esp), %xmm0 subss LCPI1_0, %xmm0 mulss LCPI1_1, %xmm0 movss LCPI1_2, %xmm1 minss %xmm1, %xmm0 xorps %xmm1, %xmm1 maxss %xmm1, %xmm0 cvttss2si %xmm0, %eax andl $65535, %eax ret This is just a first step, it can be extended in many ways. Testcase here: Transforms/InstCombine/vec_demanded_elts.ll git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@30752 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
ce0420e65c
commit
867b99f845
@ -88,6 +88,25 @@ namespace {
|
||||
if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i)))
|
||||
WorkList.push_back(Op);
|
||||
}
|
||||
|
||||
/// AddSoonDeadInstToWorklist - The specified instruction is about to become
|
||||
/// dead. Add all of its operands to the worklist, turning them into
|
||||
/// undef's to reduce the number of uses of those instructions.
|
||||
///
|
||||
/// Return the specified operand before it is turned into an undef.
|
||||
///
|
||||
Value *AddSoonDeadInstToWorklist(Instruction &I, unsigned op) {
|
||||
Value *R = I.getOperand(op);
|
||||
|
||||
for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
|
||||
if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i))) {
|
||||
WorkList.push_back(Op);
|
||||
// Set the operand to undef to drop the use.
|
||||
I.setOperand(i, UndefValue::get(Op->getType()));
|
||||
}
|
||||
|
||||
return R;
|
||||
}
|
||||
|
||||
// removeFromWorkList - remove all instances of I from the worklist.
|
||||
void removeFromWorkList(Instruction *I);
|
||||
@ -241,6 +260,9 @@ namespace {
|
||||
uint64_t &KnownZero, uint64_t &KnownOne,
|
||||
unsigned Depth = 0);
|
||||
|
||||
Value *SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
|
||||
uint64_t &UndefElts, unsigned Depth = 0);
|
||||
|
||||
// FoldOpIntoPhi - Given a binary operator or cast instruction which has a
|
||||
// PHI node as operand #0, see if we can fold the instruction into the PHI
|
||||
// (which is only possible if all operands to the PHI are constants).
|
||||
@ -1173,6 +1195,208 @@ bool InstCombiner::SimplifyDemandedBits(Value *V, uint64_t DemandedMask,
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/// SimplifyDemandedVectorElts - The specified value producecs a vector with
|
||||
/// 64 or fewer elements. DemandedElts contains the set of elements that are
|
||||
/// actually used by the caller. This method analyzes which elements of the
|
||||
/// operand are undef and returns that information in UndefElts.
|
||||
///
|
||||
/// If the information about demanded elements can be used to simplify the
|
||||
/// operation, the operation is simplified, then the resultant value is
|
||||
/// returned. This returns null if no change was made.
|
||||
Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
|
||||
uint64_t &UndefElts,
|
||||
unsigned Depth) {
|
||||
unsigned VWidth = cast<PackedType>(V->getType())->getNumElements();
|
||||
assert(VWidth <= 64 && "Vector too wide to analyze!");
|
||||
uint64_t EltMask = ~0ULL >> (64-VWidth);
|
||||
assert(DemandedElts != EltMask && (DemandedElts & ~EltMask) == 0 &&
|
||||
"Invalid DemandedElts!");
|
||||
|
||||
if (isa<UndefValue>(V)) {
|
||||
// If the entire vector is undefined, just return this info.
|
||||
UndefElts = EltMask;
|
||||
return 0;
|
||||
} else if (DemandedElts == 0) { // If nothing is demanded, provide undef.
|
||||
UndefElts = EltMask;
|
||||
return UndefValue::get(V->getType());
|
||||
}
|
||||
|
||||
UndefElts = 0;
|
||||
if (ConstantPacked *CP = dyn_cast<ConstantPacked>(V)) {
|
||||
const Type *EltTy = cast<PackedType>(V->getType())->getElementType();
|
||||
Constant *Undef = UndefValue::get(EltTy);
|
||||
|
||||
std::vector<Constant*> Elts;
|
||||
for (unsigned i = 0; i != VWidth; ++i)
|
||||
if (!(DemandedElts & (1ULL << i))) { // If not demanded, set to undef.
|
||||
Elts.push_back(Undef);
|
||||
UndefElts |= (1ULL << i);
|
||||
} else if (isa<UndefValue>(CP->getOperand(i))) { // Already undef.
|
||||
Elts.push_back(Undef);
|
||||
UndefElts |= (1ULL << i);
|
||||
} else { // Otherwise, defined.
|
||||
Elts.push_back(CP->getOperand(i));
|
||||
}
|
||||
|
||||
// If we changed the constant, return it.
|
||||
Constant *NewCP = ConstantPacked::get(Elts);
|
||||
return NewCP != CP ? NewCP : 0;
|
||||
} else if (isa<ConstantAggregateZero>(V)) {
|
||||
// Simplify the CAZ to a ConstantPacked where the non-demanded elements are
|
||||
// set to undef.
|
||||
const Type *EltTy = cast<PackedType>(V->getType())->getElementType();
|
||||
Constant *Zero = Constant::getNullValue(EltTy);
|
||||
Constant *Undef = UndefValue::get(EltTy);
|
||||
std::vector<Constant*> Elts;
|
||||
for (unsigned i = 0; i != VWidth; ++i)
|
||||
Elts.push_back((DemandedElts & (1ULL << i)) ? Zero : Undef);
|
||||
UndefElts = DemandedElts ^ EltMask;
|
||||
return ConstantPacked::get(Elts);
|
||||
}
|
||||
|
||||
if (!V->hasOneUse()) { // Other users may use these bits.
|
||||
if (Depth != 0) { // Not at the root.
|
||||
// TODO: Just compute the UndefElts information recursively.
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
} else if (Depth == 10) { // Limit search depth.
|
||||
return false;
|
||||
}
|
||||
|
||||
Instruction *I = dyn_cast<Instruction>(V);
|
||||
if (!I) return false; // Only analyze instructions.
|
||||
|
||||
bool MadeChange = false;
|
||||
uint64_t UndefElts2;
|
||||
Value *TmpV;
|
||||
switch (I->getOpcode()) {
|
||||
default: break;
|
||||
|
||||
case Instruction::InsertElement: {
|
||||
// If this is a variable index, we don't know which element it overwrites.
|
||||
// demand exactly the same input as we produce.
|
||||
ConstantUInt *Idx = dyn_cast<ConstantUInt>(I->getOperand(2));
|
||||
if (Idx == 0) {
|
||||
// Note that we can't propagate undef elt info, because we don't know
|
||||
// which elt is getting updated.
|
||||
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
|
||||
UndefElts2, Depth+1);
|
||||
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
|
||||
break;
|
||||
}
|
||||
|
||||
// If this is inserting an element that isn't demanded, remove this
|
||||
// insertelement.
|
||||
unsigned IdxNo = Idx->getValue();
|
||||
if (IdxNo >= VWidth || (DemandedElts & (1ULL << IdxNo)) == 0)
|
||||
return AddSoonDeadInstToWorklist(*I, 0);
|
||||
|
||||
// Otherwise, the element inserted overwrites whatever was there, so the
|
||||
// input demanded set is simpler than the output set.
|
||||
TmpV = SimplifyDemandedVectorElts(I->getOperand(0),
|
||||
DemandedElts & ~(1ULL << IdxNo),
|
||||
UndefElts, Depth+1);
|
||||
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
|
||||
|
||||
// The inserted element is defined.
|
||||
UndefElts |= 1ULL << IdxNo;
|
||||
break;
|
||||
}
|
||||
|
||||
case Instruction::And:
|
||||
case Instruction::Or:
|
||||
case Instruction::Xor:
|
||||
case Instruction::Add:
|
||||
case Instruction::Sub:
|
||||
case Instruction::Mul:
|
||||
// div/rem demand all inputs, because they don't want divide by zero.
|
||||
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
|
||||
UndefElts, Depth+1);
|
||||
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
|
||||
TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
|
||||
UndefElts2, Depth+1);
|
||||
if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
|
||||
|
||||
// Output elements are undefined if both are undefined. Consider things
|
||||
// like undef&0. The result is known zero, not undef.
|
||||
UndefElts &= UndefElts2;
|
||||
break;
|
||||
|
||||
case Instruction::Call: {
|
||||
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
|
||||
if (!II) break;
|
||||
switch (II->getIntrinsicID()) {
|
||||
default: break;
|
||||
|
||||
// Binary vector operations that work column-wise. A dest element is a
|
||||
// function of the corresponding input elements from the two inputs.
|
||||
case Intrinsic::x86_sse_sub_ss:
|
||||
case Intrinsic::x86_sse_mul_ss:
|
||||
case Intrinsic::x86_sse_min_ss:
|
||||
case Intrinsic::x86_sse_max_ss:
|
||||
case Intrinsic::x86_sse2_sub_sd:
|
||||
case Intrinsic::x86_sse2_mul_sd:
|
||||
case Intrinsic::x86_sse2_min_sd:
|
||||
case Intrinsic::x86_sse2_max_sd:
|
||||
TmpV = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
|
||||
UndefElts, Depth+1);
|
||||
if (TmpV) { II->setOperand(1, TmpV); MadeChange = true; }
|
||||
TmpV = SimplifyDemandedVectorElts(II->getOperand(2), DemandedElts,
|
||||
UndefElts2, Depth+1);
|
||||
if (TmpV) { II->setOperand(2, TmpV); MadeChange = true; }
|
||||
|
||||
// If only the low elt is demanded and this is a scalarizable intrinsic,
|
||||
// scalarize it now.
|
||||
if (DemandedElts == 1) {
|
||||
switch (II->getIntrinsicID()) {
|
||||
default: break;
|
||||
case Intrinsic::x86_sse_sub_ss:
|
||||
case Intrinsic::x86_sse_mul_ss:
|
||||
case Intrinsic::x86_sse2_sub_sd:
|
||||
case Intrinsic::x86_sse2_mul_sd:
|
||||
// TODO: Lower MIN/MAX/ABS/etc
|
||||
Value *LHS = II->getOperand(1);
|
||||
Value *RHS = II->getOperand(2);
|
||||
// Extract the element as scalars.
|
||||
LHS = InsertNewInstBefore(new ExtractElementInst(LHS, 0U,"tmp"), *II);
|
||||
RHS = InsertNewInstBefore(new ExtractElementInst(RHS, 0U,"tmp"), *II);
|
||||
|
||||
switch (II->getIntrinsicID()) {
|
||||
default: assert(0 && "Case stmts out of sync!");
|
||||
case Intrinsic::x86_sse_sub_ss:
|
||||
case Intrinsic::x86_sse2_sub_sd:
|
||||
TmpV = InsertNewInstBefore(BinaryOperator::createSub(LHS, RHS,
|
||||
II->getName()), *II);
|
||||
break;
|
||||
case Intrinsic::x86_sse_mul_ss:
|
||||
case Intrinsic::x86_sse2_mul_sd:
|
||||
TmpV = InsertNewInstBefore(BinaryOperator::createMul(LHS, RHS,
|
||||
II->getName()), *II);
|
||||
break;
|
||||
}
|
||||
|
||||
Instruction *New =
|
||||
new InsertElementInst(UndefValue::get(II->getType()), TmpV, 0U,
|
||||
II->getName());
|
||||
InsertNewInstBefore(New, *II);
|
||||
AddSoonDeadInstToWorklist(*II, 0);
|
||||
return New;
|
||||
}
|
||||
}
|
||||
|
||||
// Output elements are undefined if both are undefined. Consider things
|
||||
// like undef&0. The result is known zero, not undef.
|
||||
UndefElts &= UndefElts2;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return MadeChange ? I : 0;
|
||||
}
|
||||
|
||||
// isTrueWhenEqual - Return true if the specified setcondinst instruction is
|
||||
// true when both operands are equal...
|
||||
//
|
||||
@ -6088,6 +6312,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
||||
return new StoreInst(II->getOperand(2), Ptr);
|
||||
}
|
||||
break;
|
||||
|
||||
case Intrinsic::x86_sse_cvttss2si: {
|
||||
// These intrinsics only demands the 0th element of its input vector. If
|
||||
// we can simplify the input based on that, do so now.
|
||||
uint64_t UndefElts;
|
||||
if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1,
|
||||
UndefElts)) {
|
||||
II->setOperand(1, V);
|
||||
return II;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::ppc_altivec_vperm:
|
||||
// Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
|
||||
if (ConstantPacked *Mask = dyn_cast<ConstantPacked>(II->getOperand(3))) {
|
||||
@ -6121,17 +6358,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
||||
|
||||
if (ExtractedElts[Idx] == 0) {
|
||||
Instruction *Elt =
|
||||
new ExtractElementInst(Idx < 16 ? Op0 : Op1,
|
||||
ConstantUInt::get(Type::UIntTy, Idx&15),
|
||||
"tmp");
|
||||
new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
|
||||
InsertNewInstBefore(Elt, CI);
|
||||
ExtractedElts[Idx] = Elt;
|
||||
}
|
||||
|
||||
// Insert this value into the result vector.
|
||||
Result = new InsertElementInst(Result, ExtractedElts[Idx],
|
||||
ConstantUInt::get(Type::UIntTy, i),
|
||||
"tmp");
|
||||
Result = new InsertElementInst(Result, ExtractedElts[Idx], i,"tmp");
|
||||
InsertNewInstBefore(cast<Instruction>(Result), CI);
|
||||
}
|
||||
return new CastInst(Result, CI.getType());
|
||||
@ -7512,6 +7745,19 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
|
||||
// If extracting a specified index from the vector, see if we can recursively
|
||||
// find a previously computed scalar that was inserted into the vector.
|
||||
if (ConstantUInt *IdxC = dyn_cast<ConstantUInt>(EI.getOperand(1))) {
|
||||
// This instruction only demands the single element from the input vector.
|
||||
// If the input vector has a single use, simplify it based on this use
|
||||
// property.
|
||||
if (EI.getOperand(0)->hasOneUse()) {
|
||||
uint64_t UndefElts;
|
||||
if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
|
||||
1 << IdxC->getValue(),
|
||||
UndefElts)) {
|
||||
EI.setOperand(0, V);
|
||||
return &EI;
|
||||
}
|
||||
}
|
||||
|
||||
if (Value *Elt = FindScalarElement(EI.getOperand(0), IdxC->getValue()))
|
||||
return ReplaceInstUsesWith(EI, Elt);
|
||||
}
|
||||
@ -7569,8 +7815,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
|
||||
} else {
|
||||
return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
|
||||
}
|
||||
return new ExtractElementInst(Src,
|
||||
ConstantUInt::get(Type::UIntTy, SrcIdx));
|
||||
return new ExtractElementInst(Src, SrcIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -7782,6 +8027,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
|
||||
|
||||
bool MadeChange = false;
|
||||
|
||||
// Undefined shuffle mask -> undefined value.
|
||||
if (isa<UndefValue>(SVI.getOperand(2)))
|
||||
return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user