Start converting NEON load/stores to use pseudo instructions, beginning here

with the VST4 instructions.  Until after register allocation, we want to
represent sets of adjacent registers by a single super-register.  These
VST4 pseudo instructions have a single QQ or QQQQ source register operand.
They get expanded to the real VST4 instructions with 4 separate D register
operands.  Once this conversion is complete, we'll be able to remove the
NEONPreAllocPass and avoid some fragile and hacky code elsewhere.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112108 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Bob Wilson 2010-08-25 23:27:42 +00:00
parent 5b5f7260a0
commit 709d59255a
5 changed files with 166 additions and 42 deletions

View File

@ -24,6 +24,13 @@ using namespace llvm;
namespace {
class ARMExpandPseudo : public MachineFunctionPass {
// Constants for register spacing in NEON load/store instructions.
// ExpandVST4 uses these to choose which D sub-registers of the pseudo
// instruction's super-register operand become the 4 D register operands
// of the real instruction.
enum NEONRegSpacing {
SingleSpc,   // Consecutive sub-registers: dsub_0, dsub_1, dsub_2, dsub_3.
EvenDblSpc,  // Even sub-registers: dsub_0, dsub_2, dsub_4, dsub_6.
OddDblSpc    // Odd sub-registers: dsub_1, dsub_3, dsub_5, dsub_7.
};
public:
static char ID;
ARMExpandPseudo() : MachineFunctionPass(ID) {}
@ -41,6 +48,8 @@ namespace {
void TransferImpOps(MachineInstr &OldMI,
MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
bool ExpandMBB(MachineBasicBlock &MBB);
void ExpandVST4(MachineBasicBlock::iterator &MBBI, unsigned Opc,
bool hasWriteBack, NEONRegSpacing RegSpc);
};
char ARMExpandPseudo::ID = 0;
}
@ -63,6 +72,61 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
}
}
/// ExpandVST4 - Replace the VST4 pseudo instruction at MBBI, which carries
/// the stored value as one QQ or QQQQ super-register operand, with the real
/// VST4 instruction Opc, which takes 4 separate D register operands.
///
/// When hasWriteBack is set, the pseudo's first operand is the writeback
/// def and an am6offset register follows the addrmode6 operands; both are
/// copied across.  RegSpc selects which D sub-registers of the source
/// super-register are stored.  The pseudo is erased once the real
/// instruction has been built.
void ARMExpandPseudo::ExpandVST4(MachineBasicBlock::iterator &MBBI,
                                 unsigned Opc, bool hasWriteBack,
                                 NEONRegSpacing RegSpc) {
  MachineInstr &Pseudo = *MBBI;
  MachineInstrBuilder NewMI = BuildMI(*Pseudo.getParent(), MBBI,
                                      Pseudo.getDebugLoc(), TII->get(Opc));
  unsigned Idx = 0;

  // The writeback result, if any, is the leading def; keep its dead flag.
  if (hasWriteBack) {
    unsigned WBState = getDefRegState(true) |
                       getDeadRegState(Pseudo.getOperand(Idx).isDead());
    NewMI.addReg(Pseudo.getOperand(Idx).getReg(), WBState);
    ++Idx;
  }

  // The addrmode6 operands: a register followed by an immediate.
  unsigned AddrState = getKillRegState(Pseudo.getOperand(Idx).isKill());
  NewMI.addReg(Pseudo.getOperand(Idx).getReg(), AddrState);
  ++Idx;
  NewMI.addImm(Pseudo.getOperand(Idx).getImm());
  ++Idx;

  // The am6offset operand is only present on the writeback forms.
  if (hasWriteBack) {
    unsigned OffsetState = getKillRegState(Pseudo.getOperand(Idx).isKill());
    NewMI.addReg(Pseudo.getOperand(Idx).getReg(), OffsetState);
    ++Idx;
  }

  // The super-register source; its kill flag is applied to every D register.
  unsigned SrcReg = Pseudo.getOperand(Idx).getReg();
  unsigned SrcState = getKillRegState(Pseudo.getOperand(Idx).isKill());

  // Pick the 4 sub-register indices that match the requested spacing.
  unsigned SubIdx[4];
  switch (RegSpc) {
  case SingleSpc:
    SubIdx[0] = ARM::dsub_0;
    SubIdx[1] = ARM::dsub_1;
    SubIdx[2] = ARM::dsub_2;
    SubIdx[3] = ARM::dsub_3;
    break;
  case EvenDblSpc:
    SubIdx[0] = ARM::dsub_0;
    SubIdx[1] = ARM::dsub_2;
    SubIdx[2] = ARM::dsub_4;
    SubIdx[3] = ARM::dsub_6;
    break;
  default:
    assert(RegSpc == OddDblSpc && "unknown register spacing for VST4");
    SubIdx[0] = ARM::dsub_1;
    SubIdx[1] = ARM::dsub_3;
    SubIdx[2] = ARM::dsub_5;
    SubIdx[3] = ARM::dsub_7;
    break;
  }

  for (unsigned i = 0; i != 4; ++i)
    NewMI.addReg(TRI->getSubReg(SrcReg, SubIdx[i]), SrcState);

  // Add the default predicate, move implicit operands over, drop the pseudo.
  NewMI = AddDefaultPred(NewMI);
  TransferImpOps(Pseudo, NewMI, NewMI);
  Pseudo.eraseFromParent();
}
bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
@ -71,9 +135,13 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
MachineInstr &MI = *MBBI;
MachineBasicBlock::iterator NMBBI = llvm::next(MBBI);
bool ModifiedOp = true;
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
default: break;
default:
ModifiedOp = false;
break;
case ARM::tLDRpci_pic:
case ARM::t2LDRpci_pic: {
unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
@ -92,7 +160,6 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
.addOperand(MI.getOperand(2));
TransferImpOps(MI, MIB1, MIB2);
MI.eraseFromParent();
Modified = true;
break;
}
@ -128,7 +195,6 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
HI16.addImm(Pred).addReg(PredReg);
TransferImpOps(MI, LO16, HI16);
MI.eraseFromParent();
Modified = true;
break;
}
@ -155,9 +221,37 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
.addReg(OddSrc, getKillRegState(SrcIsKill)));
TransferImpOps(MI, Even, Odd);
MI.eraseFromParent();
}
case ARM::VST4d8Pseudo:
ExpandVST4(MBBI, ARM::VST4d8, false, SingleSpc); break;
case ARM::VST4d16Pseudo:
ExpandVST4(MBBI, ARM::VST4d16, false, SingleSpc); break;
case ARM::VST4d32Pseudo:
ExpandVST4(MBBI, ARM::VST4d32, false, SingleSpc); break;
case ARM::VST4d8Pseudo_UPD:
ExpandVST4(MBBI, ARM::VST4d8_UPD, true, SingleSpc); break;
case ARM::VST4d16Pseudo_UPD:
ExpandVST4(MBBI, ARM::VST4d16_UPD, true, SingleSpc); break;
case ARM::VST4d32Pseudo_UPD:
ExpandVST4(MBBI, ARM::VST4d32_UPD, true, SingleSpc); break;
case ARM::VST4q8Pseudo_UPD:
ExpandVST4(MBBI, ARM::VST4q8_UPD, true, EvenDblSpc); break;
case ARM::VST4q16Pseudo_UPD:
ExpandVST4(MBBI, ARM::VST4q16_UPD, true, EvenDblSpc); break;
case ARM::VST4q32Pseudo_UPD:
ExpandVST4(MBBI, ARM::VST4q32_UPD, true, EvenDblSpc); break;
case ARM::VST4q8oddPseudo_UPD:
ExpandVST4(MBBI, ARM::VST4q8_UPD, true, OddDblSpc); break;
case ARM::VST4q16oddPseudo_UPD:
ExpandVST4(MBBI, ARM::VST4q16_UPD, true, OddDblSpc); break;
case ARM::VST4q32oddPseudo_UPD:
ExpandVST4(MBBI, ARM::VST4q32_UPD, true, OddDblSpc); break;
break;
}
if (ModifiedOp)
Modified = true;
}
}
MBBI = NMBBI;
}

View File

@ -1260,6 +1260,11 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
Ops.push_back(MemAddr);
Ops.push_back(Align);
// FIXME: This is a temporary flag to distinguish VSTs that have been
// converted to pseudo instructions.
bool usePseudoInstrs = (NumVecs == 4 &&
VT.getSimpleVT().SimpleTy != MVT::v1i64);
if (is64BitVector) {
if (NumVecs >= 2) {
SDValue RegSeq;
@ -1278,6 +1283,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
: N->getOperand(3+3);
RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
}
if (usePseudoInstrs)
Ops.push_back(RegSeq);
else {
// Now extract the D registers back out.
Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT,
@ -1290,15 +1298,16 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
if (NumVecs > 3)
Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT,
RegSeq));
}
} else {
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
Ops.push_back(N->getOperand(Vec+3));
Ops.push_back(N->getOperand(3));
}
Ops.push_back(Pred);
Ops.push_back(Reg0); // predicate register
Ops.push_back(Chain);
unsigned Opc = DOpcodes[OpcodeIndex];
return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+5);
return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(),
usePseudoInstrs ? 6 : NumVecs+5);
}
EVT RegVT = GetNEONSubregVT(VT);
@ -1363,6 +1372,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
// Store the even D registers.
assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
Ops.push_back(Reg0); // post-access address offset
if (usePseudoInstrs)
Ops.push_back(RegSeq);
else
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl,
RegVT, RegSeq));
@ -1371,18 +1383,24 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
Ops.push_back(Chain);
unsigned Opc = QOpcodes0[OpcodeIndex];
SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
MVT::Other, Ops.data(), NumVecs+6);
MVT::Other, Ops.data(),
usePseudoInstrs ? 7 : NumVecs+6);
Chain = SDValue(VStA, 1);
// Store the odd D registers.
Ops[0] = SDValue(VStA, 0); // MemAddr
if (usePseudoInstrs)
Ops[6] = Chain;
else {
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl,
RegVT, RegSeq);
Ops[NumVecs+5] = Chain;
}
Opc = QOpcodes1[OpcodeIndex];
SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
MVT::Other, Ops.data(), NumVecs+6);
MVT::Other, Ops.data(),
usePseudoInstrs ? 7 : NumVecs+6);
Chain = SDValue(VStB, 1);
ReplaceUses(SDValue(N, 0), Chain);
return NULL;
@ -2312,14 +2330,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case Intrinsic::arm_neon_vst4: {
unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16,
ARM::VST4d32, ARM::VST1d64Q };
unsigned QOpcodes0[] = { ARM::VST4q8_UPD,
ARM::VST4q16_UPD,
ARM::VST4q32_UPD };
unsigned QOpcodes1[] = { ARM::VST4q8odd_UPD,
ARM::VST4q16odd_UPD,
ARM::VST4q32odd_UPD };
unsigned DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo,
ARM::VST4d32Pseudo, ARM::VST1d64Q };
unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
ARM::VST4q16Pseudo_UPD,
ARM::VST4q32Pseudo_UPD };
unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
ARM::VST4q16oddPseudo_UPD,
ARM::VST4q32oddPseudo_UPD };
return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
}

View File

@ -1534,6 +1534,14 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
let Inst{7-4} = op7_4;
}
// PseudoNLdSt - Base class for NEON load/store pseudo instructions.
// Records derived from it use the Pseudo format with AddrMode6 and
// Size4Bytes.  The subclass supplies the output operands (oops), the input
// operands (iops), an itinerary class and a constraint string; the predicate
// operand (pred:$p) is appended to the inputs here, and the instruction is
// only available when HasNEON holds.
class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr>
: InstARM<AddrMode6, Size4Bytes, IndexModeNone, Pseudo, NeonDomain, cstr,
itin> {
let OutOperandList = oops;
// Every pseudo takes a trailing predicate operand after the given inputs.
let InOperandList = !con(iops, (ins pred:$p));
list<Predicate> Predicates = [HasNEON];
}
class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
string opc, string dt, string asm, string cstr, list<dag> pattern>
: NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr,

View File

@ -486,6 +486,19 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">;
let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
// Classes for VST* pseudo-instructions with multi-register operands.
// These are expanded to real instructions after register allocation.

// Store of a QQPR source through an addrmode6 address; no outputs.
class VSTQQPseudo
: PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), IIC_VST, "">;

// Writeback form of the QQPR store: takes an extra am6offset operand and
// defines a GPR result ($wb) that is constrained to $addr.addr.
class VSTQQWBPseudo
: PseudoNLdSt<(outs GPR:$wb),
(ins addrmode6:$addr, am6offset:$offset, QQPR:$src), IIC_VST,
"$addr.addr = $wb">;

// Writeback form whose source is a QQQQPR register; otherwise the same
// operands and constraint as VSTQQWBPseudo.
class VSTQQQQWBPseudo
: PseudoNLdSt<(outs GPR:$wb),
(ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST,
"$addr.addr = $wb">;
// VST1 : Vector Store (multiple single elements)
class VST1D<bits<4> op7_4, string Dt>
: NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST,
@ -664,6 +677,10 @@ def VST4d8 : VST4D<0b0000, 0b0000, "8">;
def VST4d16 : VST4D<0b0000, 0b0100, "16">;
def VST4d32 : VST4D<0b0000, 0b1000, "32">;
def VST4d8Pseudo : VSTQQPseudo;
def VST4d16Pseudo : VSTQQPseudo;
def VST4d32Pseudo : VSTQQPseudo;
// ...with address register writeback:
class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
@ -676,6 +693,10 @@ def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">;
def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">;
def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">;
def VST4d8Pseudo_UPD : VSTQQWBPseudo;
def VST4d16Pseudo_UPD : VSTQQWBPseudo;
def VST4d32Pseudo_UPD : VSTQQWBPseudo;
// ...with double-spaced registers (non-updating versions for disassembly only):
def VST4q8 : VST4D<0b0001, 0b0000, "8">;
def VST4q16 : VST4D<0b0001, 0b0100, "16">;
@ -684,10 +705,14 @@ def VST4q8_UPD : VST4DWB<0b0001, 0b0000, "8">;
def VST4q16_UPD : VST4DWB<0b0001, 0b0100, "16">;
def VST4q32_UPD : VST4DWB<0b0001, 0b1000, "32">;
def VST4q8Pseudo_UPD : VSTQQQQWBPseudo;
def VST4q16Pseudo_UPD : VSTQQQQWBPseudo;
def VST4q32Pseudo_UPD : VSTQQQQWBPseudo;
// ...alternate versions to be allocated odd register numbers:
def VST4q8odd_UPD : VST4DWB<0b0001, 0b0000, "8">;
def VST4q16odd_UPD : VST4DWB<0b0001, 0b0100, "16">;
def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">;
def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo;
def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo;
def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo;
// VST1LN : Vector Store (single element from one lane)
// FIXME: Not yet implemented.

View File

@ -260,9 +260,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
Stride = 2;
return true;
case ARM::VST4d8:
case ARM::VST4d16:
case ARM::VST4d32:
case ARM::VST1d64Q:
case ARM::VST4LNd8:
case ARM::VST4LNd16:
@ -271,24 +268,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
NumRegs = 4;
return true;
case ARM::VST4q8_UPD:
case ARM::VST4q16_UPD:
case ARM::VST4q32_UPD:
FirstOpnd = 4;
NumRegs = 4;
Offset = 0;
Stride = 2;
return true;
case ARM::VST4q8odd_UPD:
case ARM::VST4q16odd_UPD:
case ARM::VST4q32odd_UPD:
FirstOpnd = 4;
NumRegs = 4;
Offset = 1;
Stride = 2;
return true;
case ARM::VST4LNq16:
case ARM::VST4LNq32:
FirstOpnd = 2;