Careful with reg_sequence coalescing not to overwrite sub-register indices.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@103971 91177308-0d34-0410-b5e6-96231b3b80d8

parent 7f43fd84db
commit 53c779bb3a
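
In short: EliminateRegSequences rewrites uses of each REG_SEQUENCE source register as sub-register references of the destination register, and the new CoalesceExtSubRegs helper below extends this to EXTRACT_SUBREG sources. The fix is to abort that rewrite whenever it would overwrite a sub-register index a use already carries. A minimal standalone sketch of the rule, using stand-in types (Operand and coalesceSrcUses are illustrative, not LLVM's MachineOperand API):

// Minimal standalone sketch of the rule this patch enforces; the types and
// names are illustrative stand-ins, not LLVM's API.
#include <vector>

struct Operand {
  unsigned Reg;    // virtual register number
  unsigned SubReg; // sub-register index; 0 means the whole register
};

// Rewrites every use of SrcReg to DstReg:NewSubIdx, but refuses (returning
// false and touching nothing) if NewSubIdx is a proper sub-register index
// and some use of SrcReg already carries its own sub-register index, since
// the rewrite would silently overwrite it.
static bool coalesceSrcUses(std::vector<Operand> &Ops, unsigned SrcReg,
                            unsigned DstReg, unsigned NewSubIdx) {
  if (NewSubIdx)
    for (const Operand &MO : Ops)
      if (MO.Reg == SrcReg && MO.SubReg != 0)
        return false; // existing index would be clobbered; abort
  for (Operand &MO : Ops)
    if (MO.Reg == SrcReg) {
      MO.Reg = DstReg;
      if (NewSubIdx)
        MO.SubReg = NewSubIdx;
    }
  return true;
}

The committed code applies the same check-then-rewrite discipline over MachineRegisterInfo's use lists via the Proceed flag in CoalesceExtSubRegs below.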
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -128,6 +128,8 @@ namespace {
     void ProcessCopy(MachineInstr *MI, MachineBasicBlock *MBB,
                      SmallPtrSet<MachineInstr*, 8> &Processed);
 
+    void CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, unsigned DstReg);
+
     /// EliminateRegSequences - Eliminate REG_SEQUENCE instructions as part
     /// of the de-ssa process. This replaces sources of REG_SEQUENCE as
     /// sub-register references of the register defined by REG_SEQUENCE.
@@ -1132,7 +1134,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
 }
 
 static void UpdateRegSequenceSrcs(unsigned SrcReg,
-                                  unsigned DstReg, unsigned SrcIdx,
+                                  unsigned DstReg, unsigned SubIdx,
                                   MachineRegisterInfo *MRI) {
   for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg),
        RE = MRI->reg_end(); RI != RE; ) {
@@ -1140,7 +1142,77 @@ static void UpdateRegSequenceSrcs(unsigned SrcReg,
     ++RI;
     MO.setReg(DstReg);
     assert(MO.getSubReg() == 0);
-    MO.setSubReg(SrcIdx);
+    MO.setSubReg(SubIdx);
   }
 }
 
+/// CoalesceExtSubRegs - If a number of sources of the REG_SEQUENCE are
+/// EXTRACT_SUBREG from the same register and to the same virtual register
+/// with different sub-register indices, attempt to combine the
+/// EXTRACT_SUBREGs and pre-coalesce them. e.g.
+/// %reg1026<def> = VLDMQ %reg1025<kill>, 260, pred:14, pred:%reg0
+/// %reg1029:6<def> = EXTRACT_SUBREG %reg1026, 6
+/// %reg1029:5<def> = EXTRACT_SUBREG %reg1026<kill>, 5
+/// Since D subregs 5, 6 can combine to a Q register, we can coalesce
+/// reg1026 to reg1029.
+void
+TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs,
+                                              unsigned DstReg) {
+  SmallSet<unsigned, 4> Seen;
+  for (unsigned i = 0, e = Srcs.size(); i != e; ++i) {
+    unsigned SrcReg = Srcs[i];
+    if (!Seen.insert(SrcReg))
+      continue;
+
+    // If there are no other uses than extract_subreg which feed into
+    // the reg_sequence, then we might be able to coalesce them.
+    bool CanCoalesce = true;
+    SmallVector<unsigned, 4> SubIndices;
+    for (MachineRegisterInfo::use_nodbg_iterator
+           UI = MRI->use_nodbg_begin(SrcReg),
+           UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+      MachineInstr *UseMI = &*UI;
+      if (!UseMI->isExtractSubreg() ||
+          UseMI->getOperand(0).getReg() != DstReg) {
+        CanCoalesce = false;
+        break;
+      }
+      SubIndices.push_back(UseMI->getOperand(2).getImm());
+    }
+
+    if (!CanCoalesce || SubIndices.size() < 2)
+      continue;
+
+    std::sort(SubIndices.begin(), SubIndices.end());
+    unsigned NewSubIdx = 0;
+    if (TRI->canCombinedSubRegIndex(MRI->getRegClass(SrcReg), SubIndices,
+                                    NewSubIdx)) {
+      bool Proceed = true;
+      if (NewSubIdx)
+        for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg),
+               RE = MRI->reg_end(); RI != RE; ) {
+          MachineOperand &MO = RI.getOperand();
+          ++RI;
+          // FIXME: If the sub-registers do not combine to the whole
+          // super-register, i.e. NewSubIdx != 0, and any of the use has a
+          // sub-register index, then abort the coalescing attempt.
+          if (MO.getSubReg()) {
+            Proceed = false;
+            break;
+          }
+          MO.setReg(DstReg);
+          MO.setSubReg(NewSubIdx);
+        }
+      if (Proceed)
+        for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg),
+               RE = MRI->reg_end(); RI != RE; ) {
+          MachineOperand &MO = RI.getOperand();
+          ++RI;
+          MO.setReg(DstReg);
+          if (NewSubIdx)
+            MO.setSubReg(NewSubIdx);
+        }
+    }
+  }
+}
+
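
TRI->canCombinedSubRegIndex above is a target hook deciding whether the sorted sub-register indices collapse into a single index. For intuition only, a toy stand-in: per the FIXME, an output index of 0 means the indices cover the whole super-register, while a non-zero index names a proper sub-register. The values 5 and 6 follow the ARM example in the doc comment (the two D halves of a Q register); kQSubRegIdx is a hypothetical placeholder, not a real ARM sub-register index.

// Toy stand-in (assumption, not the real TargetRegisterInfo hook).
#include <vector>

static const unsigned kQSubRegIdx = 1; // hypothetical Q sub-register index

static bool toyCanCombineSubRegIndices(const std::vector<unsigned> &SortedIdx,
                                       bool SrcIsQReg, unsigned &NewSubIdx) {
  if (SortedIdx.size() == 2 && SortedIdx[0] == 5 && SortedIdx[1] == 6) {
    // D subregs 5 and 6 form one Q register: either the whole source
    // register (NewSubIdx = 0) or a Q sub-register of a larger one.
    NewSubIdx = SrcIsQReg ? 0 : kQSubRegIdx;
    return true;
  }
  return false; // no single index covers this combination
}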
@@ -1221,50 +1293,15 @@ bool TwoAddressInstructionPass::EliminateRegSequences() {
 
     for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
       unsigned SrcReg = MI->getOperand(i).getReg();
-      unsigned SrcIdx = MI->getOperand(i+1).getImm();
-      UpdateRegSequenceSrcs(SrcReg, DstReg, SrcIdx, MRI);
+      unsigned SubIdx = MI->getOperand(i+1).getImm();
+      UpdateRegSequenceSrcs(SrcReg, DstReg, SubIdx, MRI);
     }
 
     DEBUG(dbgs() << "Eliminated: " << *MI);
     MI->eraseFromParent();
 
     // Try coalescing some EXTRACT_SUBREG instructions.
-    Seen.clear();
-    for (unsigned i = 0, e = RealSrcs.size(); i != e; ++i) {
-      unsigned SrcReg = RealSrcs[i];
-      if (!Seen.insert(SrcReg))
-        continue;
-
-      // If there are no other uses than extract_subreg which feed into
-      // the reg_sequence, then we might be able to coalesce them.
-      bool CanCoalesce = true;
-      SmallVector<unsigned, 4> SubIndices;
-      for (MachineRegisterInfo::use_nodbg_iterator
-             UI = MRI->use_nodbg_begin(SrcReg),
-             UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
-        MachineInstr *UseMI = &*UI;
-        if (!UseMI->isExtractSubreg() ||
-            UseMI->getOperand(0).getReg() != DstReg) {
-          CanCoalesce = false;
-          break;
-        }
-        SubIndices.push_back(UseMI->getOperand(2).getImm());
-      }
-
-      if (!CanCoalesce)
-        continue;
-
-      // %reg1026<def> = VLDMQ %reg1025<kill>, 260, pred:14, pred:%reg0
-      // %reg1029:6<def> = EXTRACT_SUBREG %reg1026, 6
-      // %reg1029:5<def> = EXTRACT_SUBREG %reg1026<kill>, 5
-      // Since D subregs 5, 6 can combine to a Q register, we can coalesce
-      // reg1026 to reg1029.
-      std::sort(SubIndices.begin(), SubIndices.end());
-      unsigned NewSubIdx = 0;
-      if (TRI->canCombinedSubRegIndex(MRI->getRegClass(SrcReg), SubIndices,
-                                      NewSubIdx))
-        UpdateRegSequenceSrcs(SrcReg, DstReg, NewSubIdx, MRI);
-    }
+    CoalesceExtSubRegs(RealSrcs, DstReg);
   }
 
   RegSequences.clear();
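
For reference, the operand layout the "i += 2" loop above walks: operand 0 of a REG_SEQUENCE defines DstReg, and each source follows as a (register, sub-register index) pair, so operands 1, 3, 5, ... are registers and 2, 4, 6, ... are their indices. A sketch with illustrative stand-in types (not LLVM's):

// Illustrative stand-in types, not LLVM's MachineInstr representation.
#include <cstdio>
#include <utility>
#include <vector>

// %DstReg = REG_SEQUENCE %Src0, SubIdx0, %Src1, SubIdx1, ...
struct ToyRegSequence {
  unsigned DstReg;
  std::vector<std::pair<unsigned, unsigned>> Srcs; // (SrcReg, SubIdx) pairs
};

static void toyEliminate(const ToyRegSequence &MI) {
  for (const auto &P : MI.Srcs)
    // Mirrors UpdateRegSequenceSrcs: every use of SrcReg becomes a
    // DstReg:SubIdx sub-register reference.
    std::printf("rewrite %%reg%u -> %%reg%u:%u\n",
                P.first, MI.DstReg, P.second);
}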
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -3,6 +3,7 @@
 
 %struct.int16x8_t = type { <8 x i16> }
 %struct.int32x4_t = type { <4 x i32> }
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
@@ -149,12 +150,51 @@ define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
   ret <8 x i16> %tmp5
 }
 
+define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
+; CHECK: t6:
+; CHECK: vldr.64
+; CHECK: vmov d1, d0
+; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
+  %tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
+  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
+  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
+  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1]
+  %tmp5 = add <8 x i8> %tmp3, %tmp4 ; <<8 x i8>> [#uses=1]
+  ret <8 x i8> %tmp5
+}
+
+define arm_apcscc void @t7(i32* %iptr, i32* %optr) nounwind {
+entry:
+; CHECK: t7:
+; CHECK: vld2.32
+; CHECK: vst2.32
+; CHECK: vld1.32 {d0, d1},
+; CHECK: vmov q1, q0
+; CHECK-NOT: vmov
+; CHECK: vuzp.32 q0, q1
+; CHECK: vst1.32
+  %0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
+  %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+  %tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1]
+  %tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1]
+  %2 = bitcast i32* %optr to i8* ; <i8*> [#uses=2]
+  tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60)
+  %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0) ; <<4 x i32>> [#uses=1]
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; <<4 x i32>> [#uses=1]
+  tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4)
+  ret void
+}
+
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
+
 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
 
 declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
 
 declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
 
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
+
 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
 
 declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
@@ -163,6 +203,8 @@ declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
 
 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
 
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
+
 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
 
 declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind