DAGCombine's logic for forming pre- and post- indexed loads / stores were being

overly conservative. It was concerned about cases where it would prohibit
folding simple [r, c] addressing modes. e.g.
  ldr r0, [r2]
  ldr r1, [r2, #4]
=>
  ldr r0, [r2], #4
  ldr r1, [r2]
Change the logic to look for such cases which allows it to form indexed memory
ops more aggressively.

rdar://10674430


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@148086 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evan Cheng 2012-01-13 01:37:24 +00:00
parent b4ee5168ab
commit c4b527ac06
3 changed files with 143 additions and 13 deletions

View File

@ -5984,6 +5984,44 @@ SDValue DAGCombiner::visitBR_CC(SDNode *N) {
return SDValue();
}
/// canFoldInAddressingMode - Return true if 'Use' is a load or a store that
/// uses N as its base pointer and that N may be folded in the load / store
/// addressing mode. FIXME: This currently only looks for folding of
/// [reg +/- imm] addressing modes.
static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
SelectionDAG &DAG,
const TargetLowering &TLI) {
EVT VT;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
return false;
VT = Use->getValueType(0);
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
return false;
VT = ST->getValue().getValueType();
} else
return false;
TargetLowering::AddrMode AM;
if (N->getOpcode() == ISD::ADD) {
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (Offset)
AM.BaseOffs = Offset->getSExtValue();
else
return false;
} else if (N->getOpcode() == ISD::SUB) {
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (Offset)
AM.BaseOffs = -Offset->getSExtValue();
else
return false;
} else
return false;
return TLI.isLegalAddressingMode(AM, VT.getTypeForEVT(*DAG.getContext()));
}
/// CombineToPreIndexedLoadStore - Try turning a load / store into a
/// pre-indexed load / store when the base pointer is an add or subtract
/// and it has other uses besides the load / store. After the
@ -6070,10 +6108,9 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
if (N->hasPredecessorHelper(Use, Visited, Worklist))
return false;
if (!((Use->getOpcode() == ISD::LOAD &&
cast<LoadSDNode>(Use)->getBasePtr() == Ptr) ||
(Use->getOpcode() == ISD::STORE &&
cast<StoreSDNode>(Use)->getBasePtr() == Ptr)))
// If Ptr may be folded in addressing mode of other use, then it's
// not profitable to do this transformation.
if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
RealUse = true;
}
@ -6170,7 +6207,8 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
continue;
// Try turning it into a post-indexed load / store except when
// 1) All uses are load / store ops that use it as base ptr.
// 1) All uses are load / store ops that use it as base ptr (and
// it may be folded as addressing mmode).
// 2) Op must be independent of N, i.e. Op is neither a predecessor
// nor a successor of N. Otherwise, if Op is folded that would
// create a cycle.
@ -6193,10 +6231,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
for (SDNode::use_iterator III = Use->use_begin(),
EEE = Use->use_end(); III != EEE; ++III) {
SDNode *UseUse = *III;
if (!((UseUse->getOpcode() == ISD::LOAD &&
cast<LoadSDNode>(UseUse)->getBasePtr().getNode() == Use) ||
(UseUse->getOpcode() == ISD::STORE &&
cast<StoreSDNode>(UseUse)->getBasePtr().getNode() == Use)))
if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
RealUse = true;
}

View File

@ -54,12 +54,12 @@ declare i8* @malloc(...)
define fastcc void @test4(i16 %addr) nounwind {
entry:
; A8: test4:
; A8: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]
; A8: str [[REG]], [r0, r1, lsl #2]
; A8: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]!
; A8: str [[REG]], [r0]
; A9: test4:
; A9: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]
; A9: str [[REG]], [r0, r1, lsl #2]
; A9: ldr [[REG:r[0-9]+]], [r0, r1, lsl #2]!
; A9: str [[REG]], [r0]
%0 = tail call i8* (...)* @malloc(i32 undef) nounwind
%1 = bitcast i8* %0 to i32*
%2 = sext i16 %addr to i32

View File

@ -0,0 +1,95 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a9 -stress-ivchain | FileCheck %s
; @sharedidx is an unrolled variant of this loop:
; for (unsigned long i = 0; i < len; i += s) {
; c[i] = a[i] + b[i];
; }
; where 's' cannot be folded into the addressing mode.
;
; This is not quite profitable to chain. But with -stress-ivchain, we
; can form three address chains in place of the shared induction
; variable.
; rdar://10674430
define void @sharedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c, i32 %s, i32 %len) nounwind ssp {
entry:
; CHECK: sharedidx:
%cmp8 = icmp eq i32 %len, 0
br i1 %cmp8, label %for.end, label %for.body
for.body: ; preds = %entry, %for.body.3
; CHECK: %for.body
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
%i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8* %a, i32 %i.09
%0 = load i8* %arrayidx, align 1
%conv6 = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8* %b, i32 %i.09
%1 = load i8* %arrayidx1, align 1
%conv27 = zext i8 %1 to i32
%add = add nsw i32 %conv27, %conv6
%conv3 = trunc i32 %add to i8
%arrayidx4 = getelementptr inbounds i8* %c, i32 %i.09
store i8 %conv3, i8* %arrayidx4, align 1
%add5 = add i32 %i.09, %s
%cmp = icmp ult i32 %add5, %len
br i1 %cmp, label %for.body.1, label %for.end
for.end: ; preds = %for.body, %for.body.1, %for.body.2, %for.body.3, %entry
ret void
for.body.1: ; preds = %for.body
; CHECK: %for.body.1
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
%arrayidx.1 = getelementptr inbounds i8* %a, i32 %add5
%2 = load i8* %arrayidx.1, align 1
%conv6.1 = zext i8 %2 to i32
%arrayidx1.1 = getelementptr inbounds i8* %b, i32 %add5
%3 = load i8* %arrayidx1.1, align 1
%conv27.1 = zext i8 %3 to i32
%add.1 = add nsw i32 %conv27.1, %conv6.1
%conv3.1 = trunc i32 %add.1 to i8
%arrayidx4.1 = getelementptr inbounds i8* %c, i32 %add5
store i8 %conv3.1, i8* %arrayidx4.1, align 1
%add5.1 = add i32 %add5, %s
%cmp.1 = icmp ult i32 %add5.1, %len
br i1 %cmp.1, label %for.body.2, label %for.end
for.body.2: ; preds = %for.body.1
; CHECK: %for.body.2
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
%arrayidx.2 = getelementptr inbounds i8* %a, i32 %add5.1
%4 = load i8* %arrayidx.2, align 1
%conv6.2 = zext i8 %4 to i32
%arrayidx1.2 = getelementptr inbounds i8* %b, i32 %add5.1
%5 = load i8* %arrayidx1.2, align 1
%conv27.2 = zext i8 %5 to i32
%add.2 = add nsw i32 %conv27.2, %conv6.2
%conv3.2 = trunc i32 %add.2 to i8
%arrayidx4.2 = getelementptr inbounds i8* %c, i32 %add5.1
store i8 %conv3.2, i8* %arrayidx4.2, align 1
%add5.2 = add i32 %add5.1, %s
%cmp.2 = icmp ult i32 %add5.2, %len
br i1 %cmp.2, label %for.body.3, label %for.end
for.body.3: ; preds = %for.body.2
; CHECK: %for.body.3
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
%arrayidx.3 = getelementptr inbounds i8* %a, i32 %add5.2
%6 = load i8* %arrayidx.3, align 1
%conv6.3 = zext i8 %6 to i32
%arrayidx1.3 = getelementptr inbounds i8* %b, i32 %add5.2
%7 = load i8* %arrayidx1.3, align 1
%conv27.3 = zext i8 %7 to i32
%add.3 = add nsw i32 %conv27.3, %conv6.3
%conv3.3 = trunc i32 %add.3 to i8
%arrayidx4.3 = getelementptr inbounds i8* %c, i32 %add5.2
store i8 %conv3.3, i8* %arrayidx4.3, align 1
%add5.3 = add i32 %add5.2, %s
%cmp.3 = icmp ult i32 %add5.3, %len
br i1 %cmp.3, label %for.body, label %for.end
}