Micro-optimization:

This code:

/* Multiplies x and y in double precision. Returns the product unchanged when
 * it compares equal to 0.0, otherwise returns the product minus 1.0. The
 * double result is converted to float by the return type. */
float floatingPointComparison(float x, float y) {
    /* The cast makes the multiplication happen in double, not float. */
    double product = (double)x * y;
    /* Exact comparison against zero: this is the floating-point compare and
     * branch whose generated code is the subject of this example. */
    if (product == 0.0)
        return product;
    return product - 1.0;
}

produces this:

_floatingPointComparison:
0000000000000000        cvtss2sd        %xmm1,%xmm1
0000000000000004        cvtss2sd        %xmm0,%xmm0
0000000000000008        mulsd           %xmm1,%xmm0
000000000000000c        pxor            %xmm1,%xmm1
0000000000000010        ucomisd         %xmm1,%xmm0
0000000000000014        jne             0x00000004
0000000000000016        jp              0x00000002
0000000000000018        jmp             0x00000008
000000000000001a        addsd           0x00000006(%rip),%xmm0
0000000000000022        cvtsd2ss        %xmm0,%xmm0
0000000000000026        ret

The "jne/jp/jmp" sequence can be reduced to this instead:

_floatingPointComparison:
0000000000000000        cvtss2sd        %xmm1,%xmm1
0000000000000004        cvtss2sd        %xmm0,%xmm0
0000000000000008        mulsd           %xmm1,%xmm0
000000000000000c        pxor            %xmm1,%xmm1
0000000000000010        ucomisd         %xmm1,%xmm0
0000000000000014        jp              0x00000002
0000000000000016        je              0x00000008
0000000000000018        addsd           0x00000006(%rip),%xmm0
0000000000000020        cvtsd2ss        %xmm0,%xmm0
0000000000000024        ret

for a savings of 2 bytes.

This transformation can be applied when we recognize that jne and jp jump to
the same "true" MBB (machine basic block), the unconditional jump would jump to
the "false" MBB, and the "true" branch is the fall-through MBB.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@97766 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Bill Wendling 2010-03-05 00:24:26 +00:00
parent b0812f114b
commit 37b52ee6d9
2 changed files with 74 additions and 13 deletions

View File

@ -1786,6 +1786,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
const SmallVectorImpl<MachineOperand> &Cond) const {
// FIXME this should probably have a DebugLoc operand
DebugLoc dl = DebugLoc::getUnknownLoc();
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
@ -1799,34 +1800,72 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
}
// Conditional branch.
const MachineBasicBlock *NextBB = next(&MBB);
unsigned Count = 0;
X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
// In a two-way conditional branch, if the fall-through block is the
// "false" branch of the conditional jumps, we can cut out the
// unconditional jump by rearranging the conditional jumps. This saves a
// few bytes and improves performance. I.e., for COND_NE_OR_P:
//
// JNE L1
// JP L1
// JMP L2
// L1:
// ...
// L2:
// ...
//
// to:
//
// JP L1
// JE L2
// L1:
// ...
// L2:
// ...
//
// Similarly for COND_NP_OR_E.
switch (CC) {
default:
BuildMI(&MBB, dl, get(GetCondBranchFromCond(CC))).addMBB(TBB);
++Count;
break;
case X86::COND_NP_OR_E:
// Synthesize NP_OR_E with two branches.
BuildMI(&MBB, dl, get(X86::JNP_4)).addMBB(TBB);
++Count;
BuildMI(&MBB, dl, get(X86::JE_4)).addMBB(TBB);
++Count;
if (FBB && FBB == NextBB) {
BuildMI(&MBB, dl, get(X86::JNP_4)).addMBB(TBB);
BuildMI(&MBB, dl, get(X86::JNE_4)).addMBB(FBB);
FBB = 0;
} else {
BuildMI(&MBB, dl, get(X86::JNP_4)).addMBB(TBB);
BuildMI(&MBB, dl, get(X86::JE_4)).addMBB(TBB);
}
Count += 2;
break;
case X86::COND_NE_OR_P:
// Synthesize NE_OR_P with two branches.
BuildMI(&MBB, dl, get(X86::JNE_4)).addMBB(TBB);
++Count;
BuildMI(&MBB, dl, get(X86::JP_4)).addMBB(TBB);
++Count;
if (FBB && FBB == NextBB) {
BuildMI(&MBB, dl, get(X86::JP_4)).addMBB(TBB);
BuildMI(&MBB, dl, get(X86::JE_4)).addMBB(FBB);
FBB = 0;
} else {
BuildMI(&MBB, dl, get(X86::JNE_4)).addMBB(TBB);
BuildMI(&MBB, dl, get(X86::JP_4)).addMBB(TBB);
}
Count += 2;
break;
default: {
unsigned Opc = GetCondBranchFromCond(CC);
BuildMI(&MBB, dl, get(Opc)).addMBB(TBB);
++Count;
}
}
if (FBB) {
// Two-way Conditional branch. Insert the second branch.
BuildMI(&MBB, dl, get(X86::JMP_4)).addMBB(FBB);
++Count;
}
return Count;
}

View File

@ -0,0 +1,22 @@
; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
; <rdar://problem/7598384>
; test1: extends both float arguments to double, multiplies them, and returns
; the product unchanged when it is ordered-equal to 0.0, or the product minus
; 1.0 otherwise. The selected double is truncated back to float for the return.
define float @test1(float %x, float %y) nounwind readnone optsize ssp {
; The generated code must contain a jp immediately followed by a je.
; CHECK: jp
; CHECK-NEXT: je
entry:
; Widen both operands so the multiply and the compare are done on doubles.
%0 = fpext float %x to double
%1 = fpext float %y to double
%2 = fmul double %0, %1
; Ordered-equal compare against zero; this compare and the branch on it are
; what the jp/je expectation above is checking.
%3 = fcmp oeq double %2, 0.000000e+00
br i1 %3, label %bb2, label %bb1
bb1:
; Only reached when the product is not ordered-equal to zero.
%4 = fadd double %2, -1.000000e+00
br label %bb2
bb2:
; Select the adjusted product (from bb1) or the raw product (from entry).
%.0.in = phi double [ %4, %bb1 ], [ %2, %entry ]
%.0 = fptrunc double %.0.in to float
ret float %.0
}