diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index 7ed2f841692..13b5775e168 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -1076,6 +1076,13 @@ class MSA_COPY_DESC_BASE : + MipsPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n), + [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> { + bit usesCustomInserter = 1; +} + class MSA_I5_DESC_BASE; +class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE; +class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE; + class CTCMSA_DESC { dag OutOperandList = (outs); dag InOperandList = (ins MSACtrl:$cd, GPR32:$rs); @@ -2579,6 +2591,9 @@ def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC; def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC; def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC; +def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC; +def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC; + def CTCMSA : CTCMSA_ENC, CTCMSA_DESC; def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC; diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 84db5ceb273..99c7019cbde 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -827,6 +827,10 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return emitMSACBranchPseudo(MI, BB, Mips::BZ_D); case Mips::SZ_V_PSEUDO: return emitMSACBranchPseudo(MI, BB, Mips::BZ_V); + case Mips::COPY_FW_PSEUDO: + return emitCOPY_FW(MI, BB); + case Mips::COPY_FD_PSEUDO: + return emitCOPY_FD(MI, BB); } } @@ -1662,10 +1666,19 @@ lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT ResTy = Op->getValueType(0); SDValue Op0 = Op->getOperand(0); - SDValue Op1 = Op->getOperand(1); - EVT EltTy = Op0->getValueType(0).getVectorElementType(); - return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1, - DAG.getValueType(EltTy)); + EVT VecTy = Op0->getValueType(0); + + if (!VecTy.is128BitVector()) + return SDValue(); + + if (ResTy.isInteger()) { + SDValue 
Op1 = Op->getOperand(1); + EVT EltTy = VecTy.getVectorElementType(); + return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1, + DAG.getValueType(EltTy)); + } + + return Op; } static bool isConstantOrUndef(const SDValue Op) { @@ -2236,3 +2249,69 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB, MI->eraseFromParent(); // The pseudo instruction is gone now. return Sink; } + +// Emit the COPY_FW pseudo instruction. +// +// copy_fw_pseudo $fd, $ws, n +// => +// splati.w $wt, $ws, $n +// copy $fd, $wt:sub_lo +// +// When n is zero, the equivalent operation can be performed with (potentially) +// zero instructions due to register overlaps. This optimization is never valid +// for lane 1 because it would require FR=0 mode which isn't supported by MSA. +MachineBasicBlock * MipsSETargetLowering:: +emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{ + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); + DebugLoc DL = MI->getDebugLoc(); + unsigned Fd = MI->getOperand(0).getReg(); + unsigned Ws = MI->getOperand(1).getReg(); + unsigned Lane = MI->getOperand(2).getImm(); + + if (Lane == 0) + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo); + else { + unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + + BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane); + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +// Emit the COPY_FD pseudo instruction. +// +// copy_fd_pseudo $fd, $ws, n +// => +// splati.d $wt, $ws, $n +// copy $fd, $wt:sub_64 +// +// When n is zero, the equivalent operation can be performed with (potentially) +// zero instructions due to register overlaps. This optimization is always +// valid because FR=1 mode which is the only supported mode in MSA. 
+MachineBasicBlock * MipsSETargetLowering:: +emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{ + assert(Subtarget->isFP64bit()); + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); + unsigned Fd = MI->getOperand(0).getReg(); + unsigned Ws = MI->getOperand(1).getReg(); + unsigned Lane = MI->getOperand(2).getImm() * 2; + DebugLoc DL = MI->getDebugLoc(); + + if (Lane == 0) + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64); + else { + unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass); + + BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1); + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h index 9b69fb5dc2a..03f1cc54a68 100644 --- a/lib/Target/Mips/MipsSEISelLowering.h +++ b/lib/Target/Mips/MipsSEISelLowering.h @@ -84,6 +84,12 @@ namespace llvm { MachineBasicBlock *emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB, unsigned BranchOp) const; + /// \brief Emit the COPY_FW pseudo instruction + MachineBasicBlock *emitCOPY_FW(MachineInstr *MI, + MachineBasicBlock *BB) const; + /// \brief Emit the COPY_FD pseudo instruction + MachineBasicBlock *emitCOPY_FD(MachineInstr *MI, + MachineBasicBlock *BB) const; }; } diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll index 7090138ed1e..0b293852a61 100644 --- a/test/CodeGen/Mips/msa/basic_operations_float.ll +++ b/test/CodeGen/Mips/msa/basic_operations_float.ll @@ -55,3 +55,83 @@ define void @const_v2f64() nounwind { ret void ; MIPS32: .size const_v2f64 } + +define float @extract_v4f32() nounwind { + ; MIPS32: extract_v4f32: + + %1 = load <4 x float>* @v4f32 + ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]], + + %2 = fadd 
<4 x float> %1, %1 + ; MIPS32-DAG: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] + + %3 = extractelement <4 x float> %2, i32 1 + ; Element 1 can be obtained by splatting it across the vector and extracting + ; $w0:sub_lo + ; MIPS32-DAG: splati.w $w0, [[R1]][1] + + ret float %3 + ; MIPS32: .size extract_v4f32 +} + +define float @extract_v4f32_elt0() nounwind { + ; MIPS32: extract_v4f32_elt0: + + %1 = load <4 x float>* @v4f32 + ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]], + + %2 = fadd <4 x float> %1, %1 + ; MIPS32-DAG: fadd.w $w0, [[R1]], [[R1]] + + %3 = extractelement <4 x float> %2, i32 0 + ; Element 0 can be obtained by extracting $w0:sub_lo ($f0) + ; MIPS32-NOT: copy_u.w + ; MIPS32-NOT: mtc1 + + ret float %3 + ; MIPS32: .size extract_v4f32_elt0 +} + +define double @extract_v2f64() nounwind { + ; MIPS32: extract_v2f64: + + %1 = load <2 x double>* @v2f64 + ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]], + + %2 = fadd <2 x double> %1, %1 + ; MIPS32-DAG: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] + + %3 = extractelement <2 x double> %2, i32 1 + ; Element 1 can be obtained by splatting it across the vector and extracting + ; $w0:sub_64 + ; MIPS32-DAG: splati.d $w0, [[R1]][1] + ; MIPS32-NOT: copy_u.w + ; MIPS32-NOT: mtc1 + ; MIPS32-NOT: mthc1 + ; MIPS32-NOT: sll + ; MIPS32-NOT: sra + + ret double %3 + ; MIPS32: .size extract_v2f64 +} + +define double @extract_v2f64_elt0() nounwind { + ; MIPS32: extract_v2f64_elt0: + + %1 = load <2 x double>* @v2f64 + ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]], + + %2 = fadd <2 x double> %1, %1 + ; MIPS32-DAG: fadd.d $w0, [[R1]], [[R1]] + + %3 = extractelement <2 x double> %2, i32 0 + ; Element 0 can be obtained by extracting $w0:sub_64 ($f0) + ; MIPS32-NOT: copy_u.w + ; MIPS32-NOT: mtc1 + ; MIPS32-NOT: mthc1 + ; MIPS32-NOT: sll + ; MIPS32-NOT: sra + + ret double %3 + ; MIPS32: .size extract_v2f64_elt0 +} diff --git a/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll b/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll new file mode 100644 index 00000000000..399d3a05011 --- 
/dev/null +++ b/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll @@ -0,0 +1,141 @@ +; RUN: llc -march=mips < %s +; RUN: llc -march=mips -mattr=+msa,+fp64 < %s + +; This test originally failed to select instructions for extract_vector_elt for +; v2f64 on MSA. +; It should at least successfully build. + +define void @autogen_SD997348632(i8*, i32*, i64*, i32, i64, i8) { +BB: + %A4 = alloca <2 x i32> + %A3 = alloca <16 x i16> + %A2 = alloca <4 x i1> + %A1 = alloca <4 x i16> + %A = alloca <2 x i32> + %L = load i8* %0 + store i8 %L, i8* %0 + %E = extractelement <4 x i32> zeroinitializer, i32 0 + %Shuff = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> + %I = insertelement <2 x i1> zeroinitializer, i1 false, i32 1 + %FC = sitofp <4 x i32> zeroinitializer to <4 x double> + %Sl = select i1 false, <4 x i64> %Shuff, <4 x i64> %Shuff + %L5 = load i8* %0 + store i8 %5, i8* %0 + %E6 = extractelement <1 x i16> zeroinitializer, i32 0 + %Shuff7 = shufflevector <2 x i1> %I, <2 x i1> %I, <2 x i32> + %I8 = insertelement <1 x i16> zeroinitializer, i16 0, i32 0 + %B = xor i32 376034, %3 + %FC9 = fptoui float 0x406DB70180000000 to i64 + %Sl10 = select i1 false, <8 x i32> , <8 x i32> + %Cmp = icmp ult <4 x i64> zeroinitializer, zeroinitializer + %L11 = load i8* %0 + store i8 %L, i8* %0 + %E12 = extractelement <4 x i64> zeroinitializer, i32 2 + %Shuff13 = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> + %I14 = insertelement <8 x i32> zeroinitializer, i32 -1, i32 7 + %B15 = fdiv <4 x double> %FC, %FC + %Tr = trunc i32 376034 to i16 + %Sl16 = select i1 false, <8 x i32> %Sl10, <8 x i32> zeroinitializer + %Cmp17 = icmp uge i32 233658, %E + br label %CF + +CF: ; preds = %CF, %CF79, %CF84, %BB + %L18 = load i8* %0 + store i8 %L, i8* %0 + %E19 = extractelement <4 x i64> %Sl, i32 3 + %Shuff20 = shufflevector <2 x i1> %Shuff7, <2 x i1> %I, <2 x i32> + %I21 = insertelement <4 x i64> zeroinitializer, i64 %FC9, i32 0 + %B22 = xor <8 x i32> 
%I14, %I14 + %Tr23 = trunc i16 0 to i8 + %Sl24 = select i1 false, <8 x i32> , <8 x i32> zeroinitializer + %Cmp25 = icmp eq i1 false, false + br i1 %Cmp25, label %CF, label %CF79 + +CF79: ; preds = %CF + %L26 = load i8* %0 + store i8 %L26, i8* %0 + %E27 = extractelement <1 x i16> zeroinitializer, i32 0 + %Shuff28 = shufflevector <16 x i32> , <16 x i32> , <16 x i32> + %I29 = insertelement <16 x i32> %Shuff28, i32 %B, i32 15 + %B30 = fdiv float 0.000000e+00, -6.749110e+06 + %Sl31 = select i1 false, i32 %3, i32 %3 + %Cmp32 = fcmp uno float 0.000000e+00, 0x406DB70180000000 + br i1 %Cmp32, label %CF, label %CF78 + +CF78: ; preds = %CF78, %CF79 + %L33 = load i8* %0 + store i8 %L, i8* %0 + %E34 = extractelement <16 x i32> %Shuff28, i32 1 + %Shuff35 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %I21, <4 x i32> + %I36 = insertelement <4 x double> %FC, double 0xA4A57F449CA36CC2, i32 2 + %Se = sext <4 x i1> %Cmp to <4 x i32> + %Sl37 = select i1 %Cmp17, i32 0, i32 0 + %Cmp38 = icmp ne i32 440284, 376034 + br i1 %Cmp38, label %CF78, label %CF80 + +CF80: ; preds = %CF80, %CF82, %CF78 + %L39 = load i8* %0 + store i8 %L, i8* %0 + %E40 = extractelement <2 x i1> %Shuff20, i32 1 + br i1 %E40, label %CF80, label %CF82 + +CF82: ; preds = %CF80 + %Shuff41 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff20, <2 x i32> + %I42 = insertelement <2 x i1> %Shuff41, i1 false, i32 0 + %B43 = sub i32 %E, 0 + %Sl44 = select i1 %Cmp32, <16 x i32> %Shuff28, <16 x i32> %Shuff28 + %Cmp45 = icmp sgt <4 x i64> zeroinitializer, %I21 + %L46 = load i8* %0 + store i8 %L11, i8* %0 + %E47 = extractelement <8 x i32> %Sl16, i32 4 + %Shuff48 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff7, <2 x i32> + %I49 = insertelement <2 x i1> %Shuff48, i1 %Cmp17, i32 1 + %B50 = and <8 x i32> %I14, %Sl10 + %FC51 = fptoui float -6.749110e+06 to i1 + br i1 %FC51, label %CF80, label %CF81 + +CF81: ; preds = %CF81, %CF82 + %Sl52 = select i1 false, float -6.749110e+06, float 0x406DB70180000000 + %Cmp53 = 
icmp uge <2 x i32> , + %L54 = load i8* %0 + store i8 %L5, i8* %0 + %E55 = extractelement <8 x i32> zeroinitializer, i32 7 + %Shuff56 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> + %I57 = insertelement <2 x i1> %Shuff7, i1 false, i32 0 + %B58 = fmul <4 x double> %FC, %FC + %FC59 = fptoui <4 x double> %I36 to <4 x i16> + %Sl60 = select i1 %Cmp17, <2 x i1> %I, <2 x i1> %I57 + %Cmp61 = icmp ule <8 x i32> %B50, + %L62 = load i8* %0 + store i8 %L33, i8* %0 + %E63 = extractelement <4 x i64> %Shuff, i32 2 + %Shuff64 = shufflevector <4 x i64> %Shuff56, <4 x i64> %Shuff56, <4 x i32> + %I65 = insertelement <2 x i1> zeroinitializer, i1 false, i32 1 + %B66 = sdiv i32 %B, %E55 + %Tr67 = trunc i8 %L54 to i1 + br i1 %Tr67, label %CF81, label %CF83 + +CF83: ; preds = %CF83, %CF81 + %Sl68 = select i1 %Cmp17, i1 %Cmp25, i1 %Tr67 + br i1 %Sl68, label %CF83, label %CF84 + +CF84: ; preds = %CF83 + %Cmp69 = icmp uge i32 %E, %E34 + br i1 %Cmp69, label %CF, label %CF77 + +CF77: ; preds = %CF84 + %L70 = load i8* %0 + store i8 %L, i8* %0 + %E71 = extractelement <4 x i64> %Shuff, i32 0 + %Shuff72 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %I, <2 x i32> + %I73 = insertelement <8 x i32> , i32 %B66, i32 1 + %FC74 = uitofp i1 %Cmp32 to double + %Sl75 = select i1 %FC51, i16 9704, i16 0 + %Cmp76 = icmp ugt <1 x i16> %I8, %I8 + store i8 %L39, i8* %0 + store i8 %5, i8* %0 + store i8 %Tr23, i8* %0 + store i8 %L, i8* %0 + store i8 %5, i8* %0 + ret void +}