CodeGenPrep: sink extends of illegal types into use block.

Summary:
This helps the instruction selector to lower an i64 * i64 -> i128
multiplication into a single instruction on targets which support it.

This is an update of D2973 which was reverted because of a bug reported
as PR19084.

Reviewers: t.p.northover, chapuni

Reviewed By: t.p.northover

CC: llvm-commits, alex, chapuni

Differential Revision: http://llvm-reviews.chandlerc.com/D3021

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@203797 91177308-0d34-0410-b5e6-96231b3b80d8
Manuel Jacob, 2014-03-13 13:36:25 +00:00
commit f8909fa140, parent 2da418712c

4 changed files with 145 additions and 84 deletions
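
To illustrate the kind of code this change benefits (the example below is not part of the patch; the function name and the use of the __int128 extension are my own), consider a full 64 x 64 -> 128-bit signed multiply. On x86-64 this maps to a single one-operand imulq, but only if the instruction selector sees the sign extensions in the same basic block as the multiply -- which is exactly what sinking the extends into the use block guarantees.

// Hypothetical illustration, not taken from this commit.
// __int128 is a GCC/Clang extension available on 64-bit targets.
__int128 widening_mul(long long a, long long b) {
  // Both operands are sign-extended to 128 bits and then multiplied;
  // with the extends next to the mul, x86-64 isel can emit one imulq.
  return (__int128)a * (__int128)b;
}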


@@ -464,40 +464,8 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
   DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
 }
 
-/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
-/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC),
-/// sink it into user blocks to reduce the number of virtual
-/// registers that must be created and coalesced.
-///
-/// Return true if any changes are made.
-///
-static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
-  // If this is a noop copy,
-  EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
-  EVT DstVT = TLI.getValueType(CI->getType());
-
-  // This is an fp<->int conversion?
-  if (SrcVT.isInteger() != DstVT.isInteger())
-    return false;
-
-  // If this is an extension, it will be a zero or sign extension, which
-  // isn't a noop.
-  if (SrcVT.bitsLT(DstVT)) return false;
-
-  // If these values will be promoted, find out what they will be promoted
-  // to. This helps us consider truncates on PPC as noop copies when they
-  // are.
-  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
-      TargetLowering::TypePromoteInteger)
-    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
-  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
-      TargetLowering::TypePromoteInteger)
-    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
-
-  // If, after promotion, these are the same types, this is a noop copy.
-  if (SrcVT != DstVT)
-    return false;
-
+/// SinkCast - Sink the specified cast instruction into its user blocks
+static bool SinkCast(CastInst *CI) {
   BasicBlock *DefBB = CI->getParent();
 
   /// InsertedCasts - Only insert a cast in each block once.
@@ -547,6 +515,43 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
   return MadeChange;
 }
 
+/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
+/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC),
+/// sink it into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+///
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+  // If this is a noop copy,
+  EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(CI->getType());
+
+  // This is an fp<->int conversion?
+  if (SrcVT.isInteger() != DstVT.isInteger())
+    return false;
+
+  // If this is an extension, it will be a zero or sign extension, which
+  // isn't a noop.
+  if (SrcVT.bitsLT(DstVT)) return false;
+
+  // If these values will be promoted, find out what they will be promoted
+  // to. This helps us consider truncates on PPC as noop copies when they
+  // are.
+  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
+      TargetLowering::TypePromoteInteger)
+    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
+  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
+      TargetLowering::TypePromoteInteger)
+    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
+
+  // If, after promotion, these are the same types, this is a noop copy.
+  if (SrcVT != DstVT)
+    return false;
+
+  return SinkCast(CI);
+}
+
 /// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce
 /// the number of virtual registers that must be created and coalesced. This is
 /// a clear win except on targets with multiple condition code registers
@@ -2811,8 +2816,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
       return true;
 
     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
-      bool MadeChange = MoveExtToFormExtLoad(I);
-      return MadeChange | OptimizeExtUses(I);
+      /// Sink a zext or sext into its user blocks if the target type doesn't
+      /// fit in one register
+      if (TLI && TLI->getTypeAction(CI->getContext(),
+                                    TLI->getValueType(CI->getType())) ==
+                     TargetLowering::TypeExpandInteger) {
+        return SinkCast(CI);
+      } else {
+        bool MadeChange = MoveExtToFormExtLoad(I);
+        return MadeChange | OptimizeExtUses(I);
+      }
     }
     return false;
   }
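
For readers unfamiliar with the pass, here is a rough sketch of what sinking a cast into its user blocks amounts to. This is illustrative only, under my own naming (sinkCastSketch); it is not the SinkCast body added by this commit, and unlike the real pass it simply skips PHI users instead of sinking into their incoming blocks.

// Rough, illustrative sketch only -- not the actual SinkCast from this commit.
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool sinkCastSketch(CastInst *CI) {
  BasicBlock *DefBB = CI->getParent();
  DenseMap<BasicBlock *, CastInst *> InsertedCasts; // at most one copy per block
  bool MadeChange = false;

  for (auto UI = CI->use_begin(), E = CI->use_end(); UI != E;) {
    Use &TheUse = *UI++; // advance first; rewriting the use unlinks it
    auto *User = cast<Instruction>(TheUse.getUser());

    // The real pass sinks into a PHI's incoming block; skip PHIs here.
    if (isa<PHINode>(User))
      continue;

    BasicBlock *UserBB = User->getParent();
    if (UserBB == DefBB)
      continue; // the cast is already local to this user

    CastInst *&Copy = InsertedCasts[UserBB];
    if (!Copy) {
      // Materialize one copy of the cast at the top of the user's block.
      Copy = CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(),
                              "", &*UserBB->getFirstInsertionPt());
      MadeChange = true;
    }
    TheUse.set(Copy); // rewrite this use to the block-local copy
  }

  // If every use was rewritten, the original cast is dead.
  if (CI->use_empty()) {
    CI->eraseFromParent();
    MadeChange = true;
  }
  return MadeChange;
}

With the extend materialized next to its users, per-block instruction selection can fold it into the consuming operation (here, the i128 multiply) instead of expanding the illegal i128 value across blocks.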


@@ -1444,54 +1444,6 @@ it would be nice to produce "into" someday.
 
 //===---------------------------------------------------------------------===//
 
-This code:
-
-void vec_mpys1(int y[], const int x[], int scaler) {
-    int i;
-    for (i = 0; i < 150; i++)
-        y[i] += (((long long)scaler * (long long)x[i]) >> 31);
-}
-
-Compiles to this loop with GCC 3.x:
-
-.L5:
-    movl    %ebx, %eax
-    imull   (%edi,%ecx,4)
-    shrdl   $31, %edx, %eax
-    addl    %eax, (%esi,%ecx,4)
-    incl    %ecx
-    cmpl    $149, %ecx
-    jle     .L5
-
-llvm-gcc compiles it to the much uglier:
-
-LBB1_1: ## bb1
-    movl    24(%esp), %eax
-    movl    (%eax,%edi,4), %ebx
-    movl    %ebx, %ebp
-    imull   %esi, %ebp
-    movl    %ebx, %eax
-    mull    %ecx
-    addl    %ebp, %edx
-    sarl    $31, %ebx
-    imull   %ecx, %ebx
-    addl    %edx, %ebx
-    shldl   $1, %eax, %ebx
-    movl    20(%esp), %eax
-    addl    %ebx, (%eax,%edi,4)
-    incl    %edi
-    cmpl    $150, %edi
-    jne     LBB1_1  ## bb1
-
-The issue is that we hoist the cast of "scaler" to long long outside of the
-loop, the value comes into the loop as two values, and
-RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
-constructed BUILD_PAIR which represents the cast value.
-
-This can be handled by making CodeGenPrepare sink the cast.
-
-//===---------------------------------------------------------------------===//
-
 Test instructions can be eliminated by using EFLAGS values from arithmetic
 instructions. This is currently not done for mul, and, or, xor, neg, shl,
 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done


@@ -0,0 +1,32 @@
; RUN: llc < %s -march=x86-64 | FileCheck %s
define void @test(i64* nocapture %arr, i64 %arrsize, i64 %factor) nounwind uwtable {
%1 = icmp sgt i64 %arrsize, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0
%2 = sext i64 %factor to i128
br label %3
; <label>:3 ; preds = %3, %.lr.ph
; CHECK-NOT: mul
; CHECK: imulq
; CHECK-NOT: mul
%carry.02 = phi i128 [ 0, %.lr.ph ], [ %10, %3 ]
%i.01 = phi i64 [ 0, %.lr.ph ], [ %11, %3 ]
%4 = getelementptr inbounds i64* %arr, i64 %i.01
%5 = load i64* %4, align 8
%6 = sext i64 %5 to i128
%7 = mul nsw i128 %6, %2
%8 = add nsw i128 %7, %carry.02
%.tr = trunc i128 %8 to i64
%9 = and i64 %.tr, 9223372036854775807
store i64 %9, i64* %4, align 8
%10 = ashr i128 %8, 63
%11 = add nsw i64 %i.01, 1
%exitcond = icmp eq i64 %11, %arrsize
br i1 %exitcond, label %._crit_edge, label %3
._crit_edge: ; preds = %3, %0
ret void
}


@@ -0,0 +1,64 @@
; RUN: opt -codegenprepare -disable-cgp-branch-opts -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; The first cast should be sunk into block2, in order that the
; instruction selector can form an efficient
; i64 * i64 -> i128 multiplication.
define i128 @sink(i64* %mem1, i64* %mem2) {
; CHECK-LABEL: block1:
; CHECK-NEXT: load
block1:
%l1 = load i64* %mem1
%s1 = sext i64 %l1 to i128
br label %block2
; CHECK-LABEL: block2:
; CHECK-NEXT: sext
; CHECK-NEXT: load
; CHECK-NEXT: sext
block2:
%l2 = load i64* %mem2
%s2 = sext i64 %l2 to i128
%res = mul i128 %s1, %s2
ret i128 %res
}
; The first cast should be hoisted into block1, in order that the
; instruction selector can form an extend-load.
define i64 @hoist(i32* %mem1, i32* %mem2) {
; CHECK-LABEL: block1:
; CHECK-NEXT: load
; CHECK-NEXT: sext
block1:
%l1 = load i32* %mem1
br label %block2
; CHECK-LABEL: block2:
; CHECK-NEXT: load
; CHECK-NEXT: sext
block2:
%s1 = sext i32 %l1 to i64
%l2 = load i32* %mem2
%s2 = sext i32 %l2 to i64
%res = mul i64 %s1, %s2
ret i64 %res
}
; Make sure the cast sink logic and OptimizeExtUses don't end up in an infinite
; loop.
define i128 @use_ext_source() {
block1:
%v1 = or i64 undef, undef
%v2 = zext i64 %v1 to i128
br i1 undef, label %block2, label %block3
block2:
%v3 = add i64 %v1, 1
%v4 = zext i64 %v3 to i128
br label %block3
block3:
%res = phi i128 [ %v2, %block1 ], [ %v4, %block2 ]
ret i128 %res
}