mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-02-12 18:33:22 +00:00
Add an optimization that looks for a specific pair-wise add pattern and generates a vpaddl instruction instead of scalarizing the add.
Includes a test case. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@133027 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
a6ad823675
commit
189531f317
@ -5523,12 +5523,112 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
|
||||
// (only after legalization).
|
||||
static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const ARMSubtarget *Subtarget) {
|
||||
|
||||
// Only perform optimization if after legalize, and if NEON is available. We
|
||||
// also expected both operands to be BUILD_VECTORs.
|
||||
if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
|
||||
|| N0.getOpcode() != ISD::BUILD_VECTOR
|
||||
|| N1.getOpcode() != ISD::BUILD_VECTOR)
|
||||
return SDValue();
|
||||
|
||||
// Check output type since VPADDL operand elements can only be 8, 16, or 32.
|
||||
EVT VT = N->getValueType(0);
|
||||
if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
|
||||
return SDValue();
|
||||
|
||||
// Check that the vector operands are of the right form.
|
||||
// N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
|
||||
// operands, where N is the size of the formed vector.
|
||||
// Each EXTRACT_VECTOR should have the same input vector and odd or even
|
||||
// index such that we have a pair wise add pattern.
|
||||
SDNode *V = 0;
|
||||
SDValue Vec;
|
||||
unsigned nextIndex = 0;
|
||||
|
||||
// Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
|
||||
if (N0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
|
||||
Vec = N0->getOperand(0)->getOperand(0);
|
||||
V = Vec.getNode();
|
||||
} else
|
||||
return SDValue();
|
||||
|
||||
// For each operands to the ADD which are BUILD_VECTORs,
|
||||
// check to see if each of their operands are an EXTRACT_VECTOR with
|
||||
// the same vector and appropriate index.
|
||||
for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
|
||||
if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
|
||||
&& N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
|
||||
|
||||
SDValue ExtVec0 = N0->getOperand(i);
|
||||
SDValue ExtVec1 = N1->getOperand(i);
|
||||
|
||||
// First operand is the vector, verify its the same.
|
||||
if (V != ExtVec0->getOperand(0).getNode() ||
|
||||
V != ExtVec1->getOperand(0).getNode())
|
||||
return SDValue();
|
||||
|
||||
// Second is the constant, verify its correct.
|
||||
ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
|
||||
ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
|
||||
|
||||
// For the constant, we want to see all the even or all the odd.
|
||||
if (!C0 || !C1 || C0->getZExtValue() != nextIndex
|
||||
|| C1->getZExtValue() != nextIndex+1)
|
||||
return SDValue();
|
||||
|
||||
// Increment index.
|
||||
nextIndex+=2;
|
||||
} else
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
// Create VPADDL node.
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
DebugLoc DL = N->getDebugLoc();
|
||||
|
||||
// Build operand list.
|
||||
SmallVector<SDValue, 8> Ops;
|
||||
Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
|
||||
TLI.getPointerTy()));
|
||||
|
||||
// Input is the vector.
|
||||
Ops.push_back(Vec);
|
||||
|
||||
// Get widened type and narrowed type.
|
||||
MVT widenType;
|
||||
unsigned numElem = VT.getVectorNumElements();
|
||||
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
|
||||
case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
|
||||
case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
|
||||
case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
|
||||
default:
|
||||
assert(0 && "Invalid vector element type for padd optimization.");
|
||||
}
|
||||
|
||||
SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
|
||||
widenType, &Ops[0], Ops.size());
|
||||
return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
|
||||
}
|
||||
|
||||
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
|
||||
/// operands N0 and N1. This is a helper for PerformADDCombine that is
|
||||
/// called with the default operands, and if that fails, with commuted
|
||||
/// operands.
|
||||
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
|
||||
TargetLowering::DAGCombinerInfo &DCI) {
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const ARMSubtarget *Subtarget){
|
||||
|
||||
// Attempt to create vpaddl for this add.
|
||||
SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
|
||||
if (Result.getNode())
|
||||
return Result;
|
||||
|
||||
// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
|
||||
if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
|
||||
SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
|
||||
@ -5540,17 +5640,18 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
|
||||
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
|
||||
///
|
||||
static SDValue PerformADDCombine(SDNode *N,
|
||||
TargetLowering::DAGCombinerInfo &DCI) {
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const ARMSubtarget *Subtarget) {
|
||||
SDValue N0 = N->getOperand(0);
|
||||
SDValue N1 = N->getOperand(1);
|
||||
|
||||
// First try with the default operand order.
|
||||
SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI);
|
||||
SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
|
||||
if (Result.getNode())
|
||||
return Result;
|
||||
|
||||
// If that didn't work, try again with the operands commuted.
|
||||
return PerformADDCombineWithOperands(N, N1, N0, DCI);
|
||||
return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
|
||||
}
|
||||
|
||||
/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
|
||||
@ -6755,7 +6856,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
switch (N->getOpcode()) {
|
||||
default: break;
|
||||
case ISD::ADD: return PerformADDCombine(N, DCI);
|
||||
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
|
||||
case ISD::SUB: return PerformSUBCombine(N, DCI);
|
||||
case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
|
||||
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
|
||||
|
@ -138,6 +138,20 @@ define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
|
||||
ret <2 x i64> %tmp2
|
||||
}
|
||||
|
||||
; Test AddCombine optimization that generates a vpaddl.s
define void @addCombineToVPADDL() nounwind ssp {
; CHECK: vpaddl.s8
  %cbcr = alloca <16 x i8>, align 16
  %X = alloca <8 x i8>, align 8
  %tmp = load <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp2 = load <16 x i8>* %cbcr
  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i8> %tmp3, %tmp1
  store <8 x i8> %add, <8 x i8>* %X, align 8
  ret void
}

declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
|
||||
|
Loading…
x
Reference in New Issue
Block a user