[mips][msa] Implemented extract_vector_elt for v4f32 or v2f64

For v4f32 and v2f64, EXTRACT_VECTOR_ELT is matched by a pseudo-insn which may
be expanded to subregister copies and/or instructions as appropriate.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@191514 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Daniel Sanders 2013-09-27 12:17:32 +00:00
parent 4cc117883d
commit b4691b495d
5 changed files with 325 additions and 4 deletions

View File

@ -1076,6 +1076,13 @@ class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass Itinerary = itin;
}
// Base class for pseudo-instructions that select OpNode applied to a vector of
// type VecTy (held in register class RCWS) and an immediate index $n, producing
// a result in register class RCD.
//
// The pseudo has a single output ($wd) and two inputs: the source vector ($ws)
// and the index ($n, a uimm4 operand matched with immZExt4).
//
// usesCustomInserter is set, so instances are not emitted as-is; they are
// expanded by the target's EmitInstrWithCustomInserter hook.
class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
RegisterClass RCD, RegisterClass RCWS> :
MipsPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n),
[(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> {
bit usesCustomInserter = 1;
}
class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
SplatComplexPattern SplatImm, RegisterClass RCWD,
RegisterClass RCWS = RCWD,
@ -1581,6 +1588,11 @@ class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
GPR32, MSA128W>;
// Pseudo matching vector_extract on a v4f32 held in an MSA128W register; the
// extracted element is produced in an FGR32 register.
class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
MSA128W>;
// Pseudo matching vector_extract on a v2f64 held in an MSA128D register; the
// extracted element is produced in an FGR64 register.
class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64, FGR64,
MSA128D>;
class CTCMSA_DESC {
dag OutOperandList = (outs);
dag InOperandList = (ins MSACtrl:$cd, GPR32:$rs);
@ -2579,6 +2591,9 @@ def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC;
// Floating-point element-extract pseudos. Unlike the neighbouring COPY_U_*
// instructions they are defined from a DESC class only, with no ENC class.
def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
def CTCMSA : CTCMSA_ENC, CTCMSA_DESC;
def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC;

View File

@ -827,6 +827,10 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return emitMSACBranchPseudo(MI, BB, Mips::BZ_D);
case Mips::SZ_V_PSEUDO:
return emitMSACBranchPseudo(MI, BB, Mips::BZ_V);
case Mips::COPY_FW_PSEUDO:
return emitCOPY_FW(MI, BB);
case Mips::COPY_FD_PSEUDO:
return emitCOPY_FD(MI, BB);
}
}
@ -1662,10 +1666,19 @@ lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT ResTy = Op->getValueType(0);
SDValue Op0 = Op->getOperand(0);
SDValue Op1 = Op->getOperand(1);
EVT EltTy = Op0->getValueType(0).getVectorElementType();
return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
DAG.getValueType(EltTy));
EVT VecTy = Op0->getValueType(0);
if (!VecTy.is128BitVector())
return SDValue();
if (ResTy.isInteger()) {
SDValue Op1 = Op->getOperand(1);
EVT EltTy = VecTy.getVectorElementType();
return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
DAG.getValueType(EltTy));
}
return Op;
}
static bool isConstantOrUndef(const SDValue Op) {
@ -2236,3 +2249,69 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
MI->eraseFromParent(); // The pseudo instruction is gone now.
return Sink;
}
// Emit the COPY_FW pseudo instruction.
//
// copy_fw_pseudo $fd, $ws, n
// =>
// splati.w $wt, $ws[n]
// copy $fd, $wt:sub_lo
//
// When n is zero, the equivalent operation can be performed with (potentially)
// zero instructions due to register overlaps: the element is simply the sub_lo
// sub-register of $ws, so a plain COPY is emitted and left for the register
// allocator to coalesce. No such shortcut is used for the other lanes (for
// lane 1 it would require FR=0 mode, which isn't supported by MSA); the
// requested lane is first splatted across a temporary vector register so that
// it can be read back through sub_lo.
//
// MI is the COPY_FW_PSEUDO to expand (operands: $fd, $ws, n) and BB is the
// block containing it. MI is erased and BB is returned; no new blocks are
// created.
MachineBasicBlock * MipsSETargetLowering::
emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
  DebugLoc DL = MI->getDebugLoc();
  unsigned Fd = MI->getOperand(0).getReg();
  unsigned Ws = MI->getOperand(1).getReg();
  unsigned Lane = MI->getOperand(2).getImm();

  if (Lane == 0)
    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
  else {
    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);

    // Broadcast the requested lane so that it also occupies lane 0, then read
    // lane 0 through the overlapping sub-register.
    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
  }

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return BB;
}
// Emit the COPY_FD pseudo instruction.
//
// copy_fd_pseudo $fd, $ws, n
// =>
// splati.d $wt, $ws[n]
// copy $fd, $wt:sub_64
//
// When n is zero, the equivalent operation can be performed with (potentially)
// zero instructions due to register overlaps: the element is simply the sub_64
// sub-register of $ws, so a plain COPY is emitted. This optimization is always
// valid because FR=1 is the only mode supported by MSA (asserted below).
//
// MI is the COPY_FD_PSEUDO to expand (operands: $fd, $ws, n) and BB is the
// block containing it. MI is erased and BB is returned; no new blocks are
// created.
MachineBasicBlock * MipsSETargetLowering::
emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
  assert(Subtarget->isFP64bit());

  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
  unsigned Fd = MI->getOperand(0).getReg();
  unsigned Ws = MI->getOperand(1).getReg();
  // Element index within the v2f64, used unscaled as the splati.d lane.
  unsigned Lane = MI->getOperand(2).getImm();
  DebugLoc DL = MI->getDebugLoc();

  if (Lane == 0)
    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
  else {
    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);

    // Broadcast the requested lane so that it also occupies lane 0, then read
    // lane 0 through the overlapping 64-bit sub-register.
    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(Lane);
    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
  }

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return BB;
}

View File

@ -84,6 +84,12 @@ namespace llvm {
MachineBasicBlock *emitMSACBranchPseudo(MachineInstr *MI,
MachineBasicBlock *BB,
unsigned BranchOp) const;
/// \brief Emit the COPY_FW pseudo instruction
///
/// Replaces \p MI with a COPY from the sub_lo sub-register of a vector
/// register, preceded by a SPLATI_W into a fresh virtual register when the
/// lane operand is non-zero. \p MI is erased and \p BB is returned.
MachineBasicBlock *emitCOPY_FW(MachineInstr *MI,
MachineBasicBlock *BB) const;
/// \brief Emit the COPY_FD pseudo instruction
///
/// Replaces \p MI with a COPY from the sub_64 sub-register of a vector
/// register, preceded by a SPLATI_D into a fresh virtual register when the
/// lane operand is non-zero. Asserts that the subtarget is FP64. \p MI is
/// erased and \p BB is returned.
MachineBasicBlock *emitCOPY_FD(MachineInstr *MI,
MachineBasicBlock *BB) const;
};
}

View File

@ -55,3 +55,83 @@ define void @const_v2f64() nounwind {
ret void
; MIPS32: .size const_v2f64
}
define float @extract_v4f32() nounwind {
  ; MIPS32: extract_v4f32:

  %1 = load <4 x float>* @v4f32
  ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],

  %2 = fadd <4 x float> %1, %1
  ; MIPS32-DAG: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]

  %3 = extractelement <4 x float> %2, i32 1
  ; Element 1 can be obtained by splatting it across the vector and extracting
  ; $w0:sub_lo. The splat reads the result of the fadd.
  ; MIPS32-DAG: splati.w $w0, [[R2]][1]

  ret float %3
  ; MIPS32: .size extract_v4f32
}
; Extracting element 0 of a v4f32: the fadd result is expected directly in $w0
; and no copy_u.w or mtc1 may be emitted to obtain the scalar.
define float @extract_v4f32_elt0() nounwind {
; MIPS32: extract_v4f32_elt0:
%1 = load <4 x float>* @v4f32
; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
%2 = fadd <4 x float> %1, %1
; MIPS32-DAG: fadd.w $w0, [[R1]], [[R1]]
%3 = extractelement <4 x float> %2, i32 0
; Element 0 can be obtained by extracting $w0:sub_lo ($f0)
; MIPS32-NOT: copy_u.w
; MIPS32-NOT: mtc1
ret float %3
; MIPS32: .size extract_v4f32_elt0
}
define double @extract_v2f64() nounwind {
  ; MIPS32: extract_v2f64:

  %1 = load <2 x double>* @v2f64
  ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],

  %2 = fadd <2 x double> %1, %1
  ; MIPS32-DAG: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]

  %3 = extractelement <2 x double> %2, i32 1
  ; Element 1 can be obtained by splatting it across the vector and extracting
  ; $w0:sub_64. The splat reads the result of the fadd.
  ; MIPS32-DAG: splati.d $w0, [[R2]][1]
  ; MIPS32-NOT: copy_u.w
  ; MIPS32-NOT: mtc1
  ; MIPS32-NOT: mthc1
  ; MIPS32-NOT: sll
  ; MIPS32-NOT: sra

  ret double %3
  ; MIPS32: .size extract_v2f64
}
; Extracting element 0 of a v2f64: the fadd result is expected directly in $w0
; and none of copy_u.w, mtc1, mthc1, sll or sra may be emitted to obtain the
; scalar.
define double @extract_v2f64_elt0() nounwind {
; MIPS32: extract_v2f64_elt0:
%1 = load <2 x double>* @v2f64
; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
%2 = fadd <2 x double> %1, %1
; MIPS32-DAG: fadd.d $w0, [[R1]], [[R1]]
%3 = extractelement <2 x double> %2, i32 0
; Element 0 can be obtained by extracting $w0:sub_64 ($f0)
; MIPS32-NOT: copy_u.w
; MIPS32-NOT: mtc1
; MIPS32-NOT: mthc1
; MIPS32-NOT: sll
; MIPS32-NOT: sra
ret double %3
; MIPS32: .size extract_v2f64_elt0
}

View File

@ -0,0 +1,141 @@
; RUN: llc -march=mips < %s
; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
; This test originally failed to select instructions for extract_vector_elt for
; v2f64 on MSA.
; It should at least successfully build.
define void @autogen_SD997348632(i8*, i32*, i64*, i32, i64, i8) {
BB:
%A4 = alloca <2 x i32>
%A3 = alloca <16 x i16>
%A2 = alloca <4 x i1>
%A1 = alloca <4 x i16>
%A = alloca <2 x i32>
%L = load i8* %0
store i8 %L, i8* %0
%E = extractelement <4 x i32> zeroinitializer, i32 0
%Shuff = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 1, i32 3, i32 5>
%I = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
%FC = sitofp <4 x i32> zeroinitializer to <4 x double>
%Sl = select i1 false, <4 x i64> %Shuff, <4 x i64> %Shuff
%L5 = load i8* %0
store i8 %5, i8* %0
%E6 = extractelement <1 x i16> zeroinitializer, i32 0
%Shuff7 = shufflevector <2 x i1> %I, <2 x i1> %I, <2 x i32> <i32 1, i32 undef>
%I8 = insertelement <1 x i16> zeroinitializer, i16 0, i32 0
%B = xor i32 376034, %3
%FC9 = fptoui float 0x406DB70180000000 to i64
%Sl10 = select i1 false, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%Cmp = icmp ult <4 x i64> zeroinitializer, zeroinitializer
%L11 = load i8* %0
store i8 %L, i8* %0
%E12 = extractelement <4 x i64> zeroinitializer, i32 2
%Shuff13 = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 undef, i32 3>
%I14 = insertelement <8 x i32> zeroinitializer, i32 -1, i32 7
%B15 = fdiv <4 x double> %FC, %FC
%Tr = trunc i32 376034 to i16
%Sl16 = select i1 false, <8 x i32> %Sl10, <8 x i32> zeroinitializer
%Cmp17 = icmp uge i32 233658, %E
br label %CF
CF: ; preds = %CF, %CF79, %CF84, %BB
%L18 = load i8* %0
store i8 %L, i8* %0
%E19 = extractelement <4 x i64> %Sl, i32 3
%Shuff20 = shufflevector <2 x i1> %Shuff7, <2 x i1> %I, <2 x i32> <i32 2, i32 0>
%I21 = insertelement <4 x i64> zeroinitializer, i64 %FC9, i32 0
%B22 = xor <8 x i32> %I14, %I14
%Tr23 = trunc i16 0 to i8
%Sl24 = select i1 false, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
%Cmp25 = icmp eq i1 false, false
br i1 %Cmp25, label %CF, label %CF79
CF79: ; preds = %CF
%L26 = load i8* %0
store i8 %L26, i8* %0
%E27 = extractelement <1 x i16> zeroinitializer, i32 0
%Shuff28 = shufflevector <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
%I29 = insertelement <16 x i32> %Shuff28, i32 %B, i32 15
%B30 = fdiv float 0.000000e+00, -6.749110e+06
%Sl31 = select i1 false, i32 %3, i32 %3
%Cmp32 = fcmp uno float 0.000000e+00, 0x406DB70180000000
br i1 %Cmp32, label %CF, label %CF78
CF78: ; preds = %CF78, %CF79
%L33 = load i8* %0
store i8 %L, i8* %0
%E34 = extractelement <16 x i32> %Shuff28, i32 1
%Shuff35 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %I21, <4 x i32> <i32 undef, i32 6, i32 0, i32 2>
%I36 = insertelement <4 x double> %FC, double 0xA4A57F449CA36CC2, i32 2
%Se = sext <4 x i1> %Cmp to <4 x i32>
%Sl37 = select i1 %Cmp17, i32 0, i32 0
%Cmp38 = icmp ne i32 440284, 376034
br i1 %Cmp38, label %CF78, label %CF80
CF80: ; preds = %CF80, %CF82, %CF78
%L39 = load i8* %0
store i8 %L, i8* %0
%E40 = extractelement <2 x i1> %Shuff20, i32 1
br i1 %E40, label %CF80, label %CF82
CF82: ; preds = %CF80
%Shuff41 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff20, <2 x i32> <i32 2, i32 0>
%I42 = insertelement <2 x i1> %Shuff41, i1 false, i32 0
%B43 = sub i32 %E, 0
%Sl44 = select i1 %Cmp32, <16 x i32> %Shuff28, <16 x i32> %Shuff28
%Cmp45 = icmp sgt <4 x i64> zeroinitializer, %I21
%L46 = load i8* %0
store i8 %L11, i8* %0
%E47 = extractelement <8 x i32> %Sl16, i32 4
%Shuff48 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff7, <2 x i32> <i32 undef, i32 1>
%I49 = insertelement <2 x i1> %Shuff48, i1 %Cmp17, i32 1
%B50 = and <8 x i32> %I14, %Sl10
%FC51 = fptoui float -6.749110e+06 to i1
br i1 %FC51, label %CF80, label %CF81
CF81: ; preds = %CF81, %CF82
%Sl52 = select i1 false, float -6.749110e+06, float 0x406DB70180000000
%Cmp53 = icmp uge <2 x i32> <i32 -1, i32 -1>, <i32 -1, i32 -1>
%L54 = load i8* %0
store i8 %L5, i8* %0
%E55 = extractelement <8 x i32> zeroinitializer, i32 7
%Shuff56 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 4, i32 6, i32 0>
%I57 = insertelement <2 x i1> %Shuff7, i1 false, i32 0
%B58 = fmul <4 x double> %FC, %FC
%FC59 = fptoui <4 x double> %I36 to <4 x i16>
%Sl60 = select i1 %Cmp17, <2 x i1> %I, <2 x i1> %I57
%Cmp61 = icmp ule <8 x i32> %B50, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%L62 = load i8* %0
store i8 %L33, i8* %0
%E63 = extractelement <4 x i64> %Shuff, i32 2
%Shuff64 = shufflevector <4 x i64> %Shuff56, <4 x i64> %Shuff56, <4 x i32> <i32 5, i32 7, i32 1, i32 undef>
%I65 = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
%B66 = sdiv i32 %B, %E55
%Tr67 = trunc i8 %L54 to i1
br i1 %Tr67, label %CF81, label %CF83
CF83: ; preds = %CF83, %CF81
%Sl68 = select i1 %Cmp17, i1 %Cmp25, i1 %Tr67
br i1 %Sl68, label %CF83, label %CF84
CF84: ; preds = %CF83
%Cmp69 = icmp uge i32 %E, %E34
br i1 %Cmp69, label %CF, label %CF77
CF77: ; preds = %CF84
%L70 = load i8* %0
store i8 %L, i8* %0
%E71 = extractelement <4 x i64> %Shuff, i32 0
%Shuff72 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %I, <2 x i32> <i32 3, i32 1>
%I73 = insertelement <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 %B66, i32 1
%FC74 = uitofp i1 %Cmp32 to double
%Sl75 = select i1 %FC51, i16 9704, i16 0
%Cmp76 = icmp ugt <1 x i16> %I8, %I8
store i8 %L39, i8* %0
store i8 %5, i8* %0
store i8 %Tr23, i8* %0
store i8 %L, i8* %0
store i8 %5, i8* %0
ret void
}