mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-04-05 17:39:16 +00:00
Generate a VTBL instruction instead of a series of loads and stores when we
can. As Nate pointed out, VTBL isn't super performant, but it *has* to be better than this: _shuf: @ BB#0: @ %entry push {r4, r7, lr} add r7, sp, #4 sub sp, #12 mov r4, sp bic r4, r4, #7 mov sp, r4 mov r2, sp vmov d16, r0, r1 orr r0, r2, #6 orr r3, r2, #7 vst1.8 {d16[0]}, [r3] vst1.8 {d16[5]}, [r0] subs r4, r7, #4 orr r0, r2, #5 vst1.8 {d16[4]}, [r0] orr r0, r2, #4 vst1.8 {d16[4]}, [r0] orr r0, r2, #3 vst1.8 {d16[0]}, [r0] orr r0, r2, #2 vst1.8 {d16[2]}, [r0] orr r0, r2, #1 vst1.8 {d16[1]}, [r0] vst1.8 {d16[3]}, [r2] vldr.64 d16, [sp] vmov r0, r1, d16 mov sp, r4 pop {r4, r7, pc} The "illegal" testcase in vext.ll is no longer illegal. <rdar://problem/9078775> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127630 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
b121bfcc22
commit
69a05a7b92
@ -2842,6 +2842,35 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
|
||||
break;
|
||||
}
|
||||
|
||||
case ARMISD::VTBL1: {
|
||||
DebugLoc dl = N->getDebugLoc();
|
||||
EVT VT = N->getValueType(0);
|
||||
SmallVector<SDValue, 6> Ops;
|
||||
|
||||
Ops.push_back(N->getOperand(0));
|
||||
Ops.push_back(N->getOperand(1));
|
||||
Ops.push_back(getAL(CurDAG)); // Predicate
|
||||
Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
|
||||
return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops.data(), Ops.size());
|
||||
}
|
||||
case ARMISD::VTBL2: {
|
||||
DebugLoc dl = N->getDebugLoc();
|
||||
EVT VT = N->getValueType(0);
|
||||
|
||||
// Form a REG_SEQUENCE to force register allocation.
|
||||
SDValue V0 = N->getOperand(0);
|
||||
SDValue V1 = N->getOperand(1);
|
||||
SDValue RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0);
|
||||
|
||||
SmallVector<SDValue, 6> Ops;
|
||||
Ops.push_back(RegSeq);
|
||||
Ops.push_back(N->getOperand(2));
|
||||
Ops.push_back(getAL(CurDAG)); // Predicate
|
||||
Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
|
||||
return CurDAG->getMachineNode(ARM::VTBL2Pseudo, dl, VT,
|
||||
Ops.data(), Ops.size());
|
||||
}
|
||||
|
||||
case ISD::CONCAT_VECTORS:
|
||||
return SelectConcatVector(N);
|
||||
}
|
||||
|
@ -852,6 +852,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
case ARMISD::VZIP: return "ARMISD::VZIP";
|
||||
case ARMISD::VUZP: return "ARMISD::VUZP";
|
||||
case ARMISD::VTRN: return "ARMISD::VTRN";
|
||||
case ARMISD::VTBL1: return "ARMISD::VTBL1";
|
||||
case ARMISD::VTBL2: return "ARMISD::VTBL2";
|
||||
case ARMISD::VTBL3: return "ARMISD::VTBL3";
|
||||
case ARMISD::VTBL4: return "ARMISD::VTBL4";
|
||||
case ARMISD::VMULLs: return "ARMISD::VMULLs";
|
||||
case ARMISD::VMULLu: return "ARMISD::VMULLu";
|
||||
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
|
||||
@ -4055,6 +4059,29 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
|
||||
}
|
||||
}
|
||||
|
||||
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
|
||||
SmallVectorImpl<int> &ShuffleMask,
|
||||
SelectionDAG &DAG) {
|
||||
// Check to see if we can use the VTBL instruction.
|
||||
SDValue V1 = Op.getOperand(0);
|
||||
SDValue V2 = Op.getOperand(1);
|
||||
DebugLoc DL = Op.getDebugLoc();
|
||||
|
||||
SmallVector<SDValue, 8> VTBLMask;
|
||||
for (SmallVectorImpl<int>::iterator
|
||||
I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
|
||||
VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
|
||||
|
||||
if (V2.getNode()->getOpcode() == ISD::UNDEF)
|
||||
return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
|
||||
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
|
||||
&VTBLMask[0], 8));
|
||||
else
|
||||
return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
|
||||
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
|
||||
&VTBLMask[0], 8));
|
||||
}
|
||||
|
||||
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
|
||||
SDValue V1 = Op.getOperand(0);
|
||||
SDValue V2 = Op.getOperand(1);
|
||||
@ -4172,6 +4199,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
|
||||
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
|
||||
}
|
||||
|
||||
if (VT == MVT::v8i8) {
|
||||
SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
|
||||
if (NewOp.getNode())
|
||||
return NewOp;
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
@ -4534,7 +4567,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::GlobalAddress:
|
||||
return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
|
||||
LowerGlobalAddressELF(Op, DAG);
|
||||
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
|
||||
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
|
||||
case ISD::SELECT: return LowerSELECT(Op, DAG);
|
||||
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
|
||||
case ISD::BR_CC: return LowerBR_CC(Op, DAG);
|
||||
|
@ -153,6 +153,10 @@ namespace llvm {
|
||||
VZIP, // zip (interleave)
|
||||
VUZP, // unzip (deinterleave)
|
||||
VTRN, // transpose
|
||||
VTBL1, // 1-register shuffle with mask
|
||||
VTBL2, // 2-register shuffle with mask
|
||||
VTBL3, // 3-register shuffle with mask
|
||||
VTBL4, // 4-register shuffle with mask
|
||||
|
||||
// Vector multiply long:
|
||||
VMULLs, // ...signed
|
||||
|
@ -121,15 +121,3 @@ define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
|
||||
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
ret <4 x i16> %tmp2
|
||||
}
|
||||
|
||||
; The actual shuffle code only handles some cases, make sure we check
|
||||
; this rather than blindly emitting a VECTOR_SHUFFLE (infinite
|
||||
; lowering loop can result otherwise).
|
||||
define <8 x i8> @test_illegal(<16 x i8>* %A, <16 x i8>* %B) nounwind {
|
||||
;CHECK: test_illegal:
|
||||
;CHECK: vst1.8
|
||||
%tmp1 = load <16 x i8>* %A
|
||||
%tmp2 = load <16 x i8>* %B
|
||||
%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <8 x i32> <i32 0, i32 7, i32 5, i32 25, i32 3, i32 2, i32 2, i32 26>
|
||||
ret <8 x i8> %tmp3
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user