[mips][msa] Implemented extract_vector_elt for v4f32 or v2f64

For v4f32 and v2f64, EXTRACT_VECTOR_ELT is matched by a pseudo-insn which may be expanded to subregister copies and/or instructions as appropriate. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@191514 91177308-0d34-0410-b5e6-96231b3b80d8
2025-08-10 18:26:02 +00:00 · 2013-09-27 12:17:32 +00:00
parent 4cc117883d
commit b4691b495d
5 changed files with 325 additions and 4 deletions
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -1076,6 +1076,13 @@ class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
  InstrItinClass Itinerary = itin;
 }

+class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
+                           RegisterClass RCD, RegisterClass RCWS> :
+      MipsPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n),
+                 [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> {
+  bit usesCustomInserter = 1; // Expanded in EmitInstrWithCustomInserter().
+}
+
 class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                       SplatComplexPattern SplatImm, RegisterClass RCWD,
                       RegisterClass RCWS = RCWD,
@@ -1581,6 +1588,11 @@ class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
 class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
                                         GPR32, MSA128W>;

+class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
+                                                 MSA128W>;
+class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64, FGR64,
+                                                 MSA128D>;
+
 class CTCMSA_DESC {
  dag OutOperandList = (outs);
  dag InOperandList = (ins MSACtrl:$cd, GPR32:$rs);
@@ -2579,6 +2591,9 @@ def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
 def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
 def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC;

+def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC; // vector_extract v4f32 -> FGR32
+def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC; // vector_extract v2f64 -> FGR64
+
 def CTCMSA : CTCMSA_ENC, CTCMSA_DESC;

 def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC;
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -827,6 +827,10 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    return emitMSACBranchPseudo(MI, BB, Mips::BZ_D);
  case Mips::SZ_V_PSEUDO:
    return emitMSACBranchPseudo(MI, BB, Mips::BZ_V);
+  case Mips::COPY_FW_PSEUDO:
+    return emitCOPY_FW(MI, BB);
+  case Mips::COPY_FD_PSEUDO:
+    return emitCOPY_FD(MI, BB);
  }
 }

@@ -1662,10 +1666,19 @@ lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT ResTy = Op->getValueType(0);
  SDValue Op0 = Op->getOperand(0);
-  SDValue Op1 = Op->getOperand(1);
-  EVT EltTy = Op0->getValueType(0).getVectorElementType();
-  return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
-                     DAG.getValueType(EltTy));
+  EVT VecTy = Op0->getValueType(0);
+
+  if (!VecTy.is128BitVector())
+    return SDValue();
+
+  if (ResTy.isInteger()) {
+    SDValue Op1 = Op->getOperand(1);
+    EVT EltTy = VecTy.getVectorElementType();
+    return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
+                       DAG.getValueType(EltTy));
+  }
+
+  return Op;
 }

 static bool isConstantOrUndef(const SDValue Op) {
@@ -2236,3 +2249,69 @@ emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return Sink;
 }
+
+// Emit the COPY_FW pseudo instruction.
+//
+// copy_fw_pseudo $fd, $ws, n
+// =>
+// splati.w $wt, $ws[n]
+// copy     $fd, $wt:sub_lo
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is never valid
+// for lane 1 because it would require FR=0 mode which isn't supported by MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Fd = MI->getOperand(0).getReg();
+  unsigned Ws = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the COPY_FD pseudo instruction.
+//
+// copy_fd_pseudo $fd, $ws, n
+// =>
+// splati.d $wt, $ws[n]
+// copy     $fd, $wt:sub_64
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is always
+// valid because FR=1 mode is the only supported mode in MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
+  assert(Subtarget->isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  unsigned Fd  = MI->getOperand(0).getReg();
+  unsigned Ws  = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(Lane);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -84,6 +84,12 @@ namespace llvm {
    MachineBasicBlock *emitMSACBranchPseudo(MachineInstr *MI,
                                            MachineBasicBlock *BB,
                                            unsigned BranchOp) const;
+    /// \brief Emit the COPY_FW pseudo instruction (v4f32 lane -> FGR32)
+    MachineBasicBlock *emitCOPY_FW(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the COPY_FD pseudo instruction (v2f64 lane -> FGR64)
+    MachineBasicBlock *emitCOPY_FD(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
  };
 }

--- a/test/CodeGen/Mips/msa/basic_operations_float.ll
+++ b/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -55,3 +55,83 @@ define void @const_v2f64() nounwind {
  ret void
  ; MIPS32: .size const_v2f64
 }
+
+define float @extract_v4f32() nounwind {
+  ; MIPS32: extract_v4f32:
+
+  %1 = load <4 x float>* @v4f32
+  ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = fadd <4 x float> %1, %1
+  ; MIPS32-DAG: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <4 x float> %2, i32 1
+  ; Element 1 can be obtained by splatting it across the vector and extracting
+  ; $w0:sub_lo
+  ; MIPS32-DAG: splati.w $w0, [[R2]][1]
+
+  ret float %3
+  ; MIPS32: .size extract_v4f32
+}
+
+define float @extract_v4f32_elt0() nounwind {
+  ; MIPS32: extract_v4f32_elt0:
+
+  %1 = load <4 x float>* @v4f32
+  ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = fadd <4 x float> %1, %1
+  ; MIPS32-DAG: fadd.w $w0, [[R1]], [[R1]]
+
+  %3 = extractelement <4 x float> %2, i32 0
+  ; Element 0 is already in $w0:sub_lo ($f0); no copy_u.w/mtc1 is needed
+  ; MIPS32-NOT: copy_u.w
+  ; MIPS32-NOT: mtc1
+
+  ret float %3
+  ; MIPS32: .size extract_v4f32_elt0
+}
+
+define double @extract_v2f64() nounwind {
+  ; MIPS32: extract_v2f64:
+
+  %1 = load <2 x double>* @v2f64
+  ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
+
+  %2 = fadd <2 x double> %1, %1
+  ; MIPS32-DAG: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <2 x double> %2, i32 1
+  ; Element 1 can be obtained by splatting it across the vector and extracting
+  ; $w0:sub_64
+  ; MIPS32-DAG: splati.d $w0, [[R2]][1]
+  ; MIPS32-NOT: copy_u.w
+  ; MIPS32-NOT: mtc1
+  ; MIPS32-NOT: mthc1
+  ; MIPS32-NOT: sll
+  ; MIPS32-NOT: sra
+
+  ret double %3
+  ; MIPS32: .size extract_v2f64
+}
+
+define double @extract_v2f64_elt0() nounwind {
+  ; MIPS32: extract_v2f64_elt0:
+
+  %1 = load <2 x double>* @v2f64
+  ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
+
+  %2 = fadd <2 x double> %1, %1
+  ; MIPS32-DAG: fadd.d $w0, [[R1]], [[R1]]
+
+  %3 = extractelement <2 x double> %2, i32 0
+  ; Element 0 is already in $w0:sub_64 ($f0); no GPR moves/shifts are needed
+  ; MIPS32-NOT: copy_u.w
+  ; MIPS32-NOT: mtc1
+  ; MIPS32-NOT: mthc1
+  ; MIPS32-NOT: sll
+  ; MIPS32-NOT: sra
+
+  ret double %3
+  ; MIPS32: .size extract_v2f64_elt0
+}
--- a/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll
+++ b/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll
@@ -0,0 +1,141 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+
+; This test originally failed to select instructions for extract_vector_elt for
+; v2f64 on MSA.
+; It should at least successfully build.
+
+define void @autogen_SD997348632(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca <2 x i32>
+  %A3 = alloca <16 x i16>
+  %A2 = alloca <4 x i1>
+  %A1 = alloca <4 x i16>
+  %A = alloca <2 x i32>
+  %L = load i8* %0
+  store i8 %L, i8* %0
+  %E = extractelement <4 x i32> zeroinitializer, i32 0
+  %Shuff = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 1, i32 3, i32 5>
+  %I = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
+  %FC = sitofp <4 x i32> zeroinitializer to <4 x double>
+  %Sl = select i1 false, <4 x i64> %Shuff, <4 x i64> %Shuff
+  %L5 = load i8* %0
+  store i8 %5, i8* %0
+  %E6 = extractelement <1 x i16> zeroinitializer, i32 0
+  %Shuff7 = shufflevector <2 x i1> %I, <2 x i1> %I, <2 x i32> <i32 1, i32 undef>
+  %I8 = insertelement <1 x i16> zeroinitializer, i16 0, i32 0
+  %B = xor i32 376034, %3
+  %FC9 = fptoui float 0x406DB70180000000 to i64
+  %Sl10 = select i1 false, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %Cmp = icmp ult <4 x i64> zeroinitializer, zeroinitializer
+  %L11 = load i8* %0
+  store i8 %L, i8* %0
+  %E12 = extractelement <4 x i64> zeroinitializer, i32 2
+  %Shuff13 = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 undef, i32 3>
+  %I14 = insertelement <8 x i32> zeroinitializer, i32 -1, i32 7
+  %B15 = fdiv <4 x double> %FC, %FC
+  %Tr = trunc i32 376034 to i16
+  %Sl16 = select i1 false, <8 x i32> %Sl10, <8 x i32> zeroinitializer
+  %Cmp17 = icmp uge i32 233658, %E
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF79, %CF84, %BB
+  %L18 = load i8* %0
+  store i8 %L, i8* %0
+  %E19 = extractelement <4 x i64> %Sl, i32 3
+  %Shuff20 = shufflevector <2 x i1> %Shuff7, <2 x i1> %I, <2 x i32> <i32 2, i32 0>
+  %I21 = insertelement <4 x i64> zeroinitializer, i64 %FC9, i32 0
+  %B22 = xor <8 x i32> %I14, %I14
+  %Tr23 = trunc i16 0 to i8
+  %Sl24 = select i1 false, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
+  %Cmp25 = icmp eq i1 false, false
+  br i1 %Cmp25, label %CF, label %CF79
+
+CF79:                                             ; preds = %CF
+  %L26 = load i8* %0
+  store i8 %L26, i8* %0
+  %E27 = extractelement <1 x i16> zeroinitializer, i32 0
+  %Shuff28 = shufflevector <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+  %I29 = insertelement <16 x i32> %Shuff28, i32 %B, i32 15
+  %B30 = fdiv float 0.000000e+00, -6.749110e+06
+  %Sl31 = select i1 false, i32 %3, i32 %3
+  %Cmp32 = fcmp uno float 0.000000e+00, 0x406DB70180000000
+  br i1 %Cmp32, label %CF, label %CF78
+
+CF78:                                             ; preds = %CF78, %CF79
+  %L33 = load i8* %0
+  store i8 %L, i8* %0
+  %E34 = extractelement <16 x i32> %Shuff28, i32 1
+  %Shuff35 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %I21, <4 x i32> <i32 undef, i32 6, i32 0, i32 2>
+  %I36 = insertelement <4 x double> %FC, double 0xA4A57F449CA36CC2, i32 2
+  %Se = sext <4 x i1> %Cmp to <4 x i32>
+  %Sl37 = select i1 %Cmp17, i32 0, i32 0
+  %Cmp38 = icmp ne i32 440284, 376034
+  br i1 %Cmp38, label %CF78, label %CF80
+
+CF80:                                             ; preds = %CF80, %CF82, %CF78
+  %L39 = load i8* %0
+  store i8 %L, i8* %0
+  %E40 = extractelement <2 x i1> %Shuff20, i32 1
+  br i1 %E40, label %CF80, label %CF82
+
+CF82:                                             ; preds = %CF80
+  %Shuff41 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff20, <2 x i32> <i32 2, i32 0>
+  %I42 = insertelement <2 x i1> %Shuff41, i1 false, i32 0
+  %B43 = sub i32 %E, 0
+  %Sl44 = select i1 %Cmp32, <16 x i32> %Shuff28, <16 x i32> %Shuff28
+  %Cmp45 = icmp sgt <4 x i64> zeroinitializer, %I21
+  %L46 = load i8* %0
+  store i8 %L11, i8* %0
+  %E47 = extractelement <8 x i32> %Sl16, i32 4
+  %Shuff48 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff7, <2 x i32> <i32 undef, i32 1>
+  %I49 = insertelement <2 x i1> %Shuff48, i1 %Cmp17, i32 1
+  %B50 = and <8 x i32> %I14, %Sl10
+  %FC51 = fptoui float -6.749110e+06 to i1
+  br i1 %FC51, label %CF80, label %CF81
+
+CF81:                                             ; preds = %CF81, %CF82
+  %Sl52 = select i1 false, float -6.749110e+06, float 0x406DB70180000000
+  %Cmp53 = icmp uge <2 x i32> <i32 -1, i32 -1>, <i32 -1, i32 -1>
+  %L54 = load i8* %0
+  store i8 %L5, i8* %0
+  %E55 = extractelement <8 x i32> zeroinitializer, i32 7
+  %Shuff56 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 4, i32 6, i32 0>
+  %I57 = insertelement <2 x i1> %Shuff7, i1 false, i32 0
+  %B58 = fmul <4 x double> %FC, %FC
+  %FC59 = fptoui <4 x double> %I36 to <4 x i16>
+  %Sl60 = select i1 %Cmp17, <2 x i1> %I, <2 x i1> %I57
+  %Cmp61 = icmp ule <8 x i32> %B50, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %L62 = load i8* %0
+  store i8 %L33, i8* %0
+  %E63 = extractelement <4 x i64> %Shuff, i32 2
+  %Shuff64 = shufflevector <4 x i64> %Shuff56, <4 x i64> %Shuff56, <4 x i32> <i32 5, i32 7, i32 1, i32 undef>
+  %I65 = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
+  %B66 = sdiv i32 %B, %E55
+  %Tr67 = trunc i8 %L54 to i1
+  br i1 %Tr67, label %CF81, label %CF83
+
+CF83:                                             ; preds = %CF83, %CF81
+  %Sl68 = select i1 %Cmp17, i1 %Cmp25, i1 %Tr67
+  br i1 %Sl68, label %CF83, label %CF84
+
+CF84:                                             ; preds = %CF83
+  %Cmp69 = icmp uge i32 %E, %E34
+  br i1 %Cmp69, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF84
+  %L70 = load i8* %0
+  store i8 %L, i8* %0
+  %E71 = extractelement <4 x i64> %Shuff, i32 0
+  %Shuff72 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %I, <2 x i32> <i32 3, i32 1>
+  %I73 = insertelement <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 %B66, i32 1
+  %FC74 = uitofp i1 %Cmp32 to double
+  %Sl75 = select i1 %FC51, i16 9704, i16 0
+  %Cmp76 = icmp ugt <1 x i16> %I8, %I8
+  store i8 %L39, i8* %0
+  store i8 %5, i8* %0
+  store i8 %Tr23, i8* %0
+  store i8 %L, i8* %0
+  store i8 %5, i8* %0
+  ret void
+}