X86: Perform integer comparisons at i32 or larger.

Utilizing the 8 and 16 bit comparison instructions, even when an input can
be folded into the comparison instruction itself, is typically not worth it.
There are too many partial register stalls as a result, leading to significant
slowdowns. By always performing comparisons on at least 32-bit
registers, performance of the calculation chain leading to the
comparison improves. Continue to use the smaller comparisons when
minimizing size, as that allows better folding of loads into the
comparison instructions.

rdar://15386341

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@195496 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Jim Grosbach 2013-11-22 19:57:47 +00:00
parent 22bc1320b5
commit e1af5f6ad1
7 changed files with 50 additions and 106 deletions

View File

@ -3419,6 +3419,24 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
}
}
/// \brief Return true if the condition is an unsigned comparison operation.
///
/// Used to decide whether operands may be zero-extended (unsigned and
/// equality conditions) or must be sign-extended (signed relational
/// conditions) when widening a comparison. Equality/inequality count as
/// "unsigned" here because zero-extension preserves their result.
static bool isX86CCUnsigned(unsigned X86CC) {
  switch (X86CC) {
  default: llvm_unreachable("Invalid integer condition!");
  // Signed relational comparisons require sign-extension.
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
    return false;
  // Equality and unsigned relational comparisons are safe to zero-extend.
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return true;
  }
  llvm_unreachable("covered switch fell through?!");
}
/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
@ -9662,6 +9680,17 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
SDLoc dl(Op0);
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
// Do the comparison at i32 if it's smaller. This avoids subregister
// aliasing issues. Keep the smaller reference if we're optimizing for
// size, however, as that'll allow better folding of memory operations.
if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
!DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
AttributeSet::FunctionIndex, Attribute::MinSize)) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
}
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,

View File

@ -1,87 +0,0 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu | grep addb | not grep x
; RUN: llc < %s -mtriple=x86_64-linux-gnu | grep cmpb | not grep x
; PR1734
target triple = "x86_64-unknown-linux-gnu"
%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
%struct.eh_status = type opaque
%struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack*, i32, %struct.location_t, i32, i8*, %struct.rtx_def** }
%struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
%struct.function = type { %struct.eh_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.CUMULATIVE_ARGS, %struct.rtx_def*, %struct.rtx_def*, %struct.initial_value_struct*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i8, i32, i64, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.varray_head_tag*, %struct.temp_slot*, i32, %struct.var_refs_queue*, i32, i32, %struct.rtvec_def*, %struct.tree_node*, i32, i32, i32, %struct.machine_function*, i32, i32, i8, i8, %struct.language_function*, %struct.rtx_def*, i32, i32, i32, i32, %struct.location_t, %struct.varray_head_tag*, %struct.tree_node*, %struct.tree_node*, i8, i8, i8 }
%struct.initial_value_struct = type opaque
%struct.lang_decl = type opaque
%struct.language_function = type opaque
%struct.location_t = type { i8*, i32 }
%struct.machine_function = type { %struct.stack_local_entry*, i8*, %struct.rtx_def*, i32, i32, i32, i32, i32 }
%struct.rtunion = type { i8* }
%struct.rtvec_def = type { i32, [1 x %struct.rtx_def*] }
%struct.rtx_def = type { i16, i8, i8, %struct.u }
%struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack* }
%struct.stack_local_entry = type opaque
%struct.temp_slot = type opaque
%struct.tree_common = type { %struct.tree_node*, %struct.tree_node*, %union.tree_ann_d*, i8, i8, i8, i8, i8 }
%struct.tree_decl = type { %struct.tree_common, %struct.location_t, i32, %struct.tree_node*, i8, i8, i8, i8, i8, i8, i8, i8, i32, %struct.tree_decl_u1, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, i32, %struct.tree_decl_u2, %struct.tree_node*, %struct.tree_node*, i64, %struct.lang_decl* }
%struct.tree_decl_u1 = type { i64 }
%struct.tree_decl_u2 = type { %struct.function* }
%struct.tree_node = type { %struct.tree_decl }
%struct.u = type { [1 x %struct.rtunion] }
%struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* }
%struct.varasm_status = type opaque
%struct.varray_data = type { [1 x i64] }
%struct.varray_head_tag = type { i64, i64, i32, i8*, %struct.varray_data }
%union.tree_ann_d = type opaque
; Regression test for PR1734 — apparently reduced from a compiler's
; layout_type() routine (the %struct.tree_*/%struct.rtx_def types suggest
; GCC internals; TODO confirm). The interesting operations are the i8
; values truncated from 32-bit loads (%tmp3435, %tmp221222) that feed an
; i8 equality compare and an i8 switch: the RUN lines grep the generated
; addb/cmpb instructions to check their operands.
define void @layout_type(%struct.tree_node* %type) {
entry:
; Load a 32-bit flags/mode word (from a null placeholder pointer — this
; test is only compiled, never executed) and keep both the full word and
; its low 8 bits.
%tmp32 = load i32* null, align 8 ; <i32> [#uses=3]
%tmp3435 = trunc i32 %tmp32 to i8 ; <i8> [#uses=1]
%tmp53 = icmp eq %struct.tree_node* null, null ; <i1> [#uses=1]
br i1 %tmp53, label %cond_next57, label %UnifiedReturnBlock
cond_next57: ; preds = %entry
; Dispatch on the low byte of the loaded word.
%tmp65 = and i32 %tmp32, 255 ; <i32> [#uses=1]
switch i32 %tmp65, label %UnifiedReturnBlock [
i32 6, label %bb140
i32 7, label %bb140
i32 8, label %bb140
i32 13, label %bb478
]
bb140: ; preds = %cond_next57, %cond_next57, %cond_next57
; i8 equality compare on a truncated load — exercises the backend's
; 8-bit cmp (cmpb) lowering, which the RUN lines inspect.
%tmp219 = load i32* null, align 8 ; <i32> [#uses=1]
%tmp221222 = trunc i32 %tmp219 to i8 ; <i8> [#uses=1]
%tmp223 = icmp eq i8 %tmp221222, 24 ; <i1> [#uses=1]
br i1 %tmp223, label %cond_true226, label %cond_next340
cond_true226: ; preds = %bb140
; i8 switch on the other truncated byte.
switch i8 %tmp3435, label %cond_true288 [
i8 6, label %cond_next340
i8 9, label %cond_next340
i8 7, label %cond_next340
i8 8, label %cond_next340
i8 10, label %cond_next340
]
cond_true288: ; preds = %cond_true226
unreachable
cond_next340: ; preds = %cond_true226, %cond_true226, %cond_true226, %cond_true226, %cond_true226, %bb140
ret void
bb478: ; preds = %cond_next57
br i1 false, label %cond_next500, label %cond_true497
cond_true497: ; preds = %bb478
unreachable
cond_next500: ; preds = %bb478
; Read-modify-write of one flag bit: clear bit 13 of %tmp32 and merge in
; bit 13 of the freshly loaded word.
%tmp513 = load i32* null, align 8 ; <i32> [#uses=1]
%tmp545 = and i32 %tmp513, 8192 ; <i32> [#uses=1]
%tmp547 = and i32 %tmp32, -8193 ; <i32> [#uses=1]
%tmp548 = or i32 %tmp547, %tmp545 ; <i32> [#uses=1]
store i32 %tmp548, i32* null, align 8
ret void
UnifiedReturnBlock: ; preds = %cond_next57, %entry
ret void
}

View File

@ -34,7 +34,7 @@ entry:
; 64BIT-LABEL: t2:
; 64BIT-NOT: movw %si, %ax
; 64BIT: decl %eax
; 64BIT: leal -1(%rsi), %eax
; 64BIT: movzwl %ax
%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
%1 = add i16 %k, -1 ; <i16> [#uses=3]
@ -59,7 +59,7 @@ entry:
; 64BIT-LABEL: t3:
; 64BIT-NOT: movw %si, %ax
; 64BIT: addl $2, %eax
; 64BIT: leal 2(%rsi), %eax
%0 = add i16 %k, 2 ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
@ -82,7 +82,7 @@ entry:
; 64BIT-LABEL: t4:
; 64BIT-NOT: movw %si, %ax
; 64BIT: addl %edi, %eax
; 64BIT: leal (%rsi,%rdi), %eax
%0 = add i16 %k, %c ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1

View File

@ -5,7 +5,7 @@
; CodeGenPrepare should move the zext into the block with the load
; so that SelectionDAG can select it with the load.
; CHECK: movzbl ({{%rdi|%rcx}}), %eax
; CHECK: movsbl ({{%rdi|%rcx}}), %eax
define void @foo(i8* %p, i32* %q) {
entry:

View File

@ -35,6 +35,6 @@ define i32 @test3(i64 %x) nounwind readnone {
%conv = zext i1 %cmp to i32
ret i32 %conv
; CHECK-LABEL: test3:
; CHECK: cmpb $2
; CHECK: cmpl $2
; CHECK: ret
}

View File

@ -22,8 +22,9 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp2:
; CHECK: movw ([[A0:%rdi|%rcx]]), %ax
; CHECK: cmpw ([[A1:%rsi|%rdx]]), %ax
; CHECK: movzwl
; CHECK-NEXT: movzwl
; CHECK-NEXT: cmpl
; NOBUILTIN-LABEL: memcmp2:
; NOBUILTIN: callq
}
@ -41,7 +42,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp2a:
; CHECK: cmpw $28527, ([[A0]])
; CHECK: movzwl
; CHECK-NEXT: cmpl $28527,
}
@ -58,8 +60,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp4:
; CHECK: movl ([[A0]]), %eax
; CHECK: cmpl ([[A1]]), %eax
; CHECK: movl
; CHECK-NEXT: cmpl
}
define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind {
@ -75,7 +77,7 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp4a:
; CHECK: cmpl $1869573999, ([[A0]])
; CHECK: cmpl $1869573999,
}
define void @memcmp8(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
@ -91,8 +93,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp8:
; CHECK: movq ([[A0]]), %rax
; CHECK: cmpq ([[A1]]), %rax
; CHECK: movq
; CHECK: cmpq
}
define void @memcmp8a(i8* %X, i32* nocapture %P) nounwind {
@ -108,7 +110,7 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp8a:
; CHECK: movabsq $8029759185026510694, %rax
; CHECK: cmpq %rax, ([[A0]])
; CHECK: movabsq $8029759185026510694,
; CHECK: cmpq
}

View File

@ -2,7 +2,7 @@
declare void @bar()
define void @test1(i32* nocapture %X) nounwind {
define void @test1(i32* nocapture %X) nounwind minsize {
entry:
%tmp1 = load i32* %X, align 4
%and = and i32 %tmp1, 255
@ -19,7 +19,7 @@ if.end:
; CHECK: cmpb $47, (%{{rdi|rcx}})
}
define void @test2(i32 %X) nounwind {
define void @test2(i32 %X) nounwind minsize {
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 47
@ -35,7 +35,7 @@ if.end:
; CHECK: cmpb $47, %{{dil|cl}}
}
define void @test3(i32 %X) nounwind {
define void @test3(i32 %X) nounwind minsize {
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 255
@ -70,7 +70,7 @@ lor.end: ; preds = %lor.rhs, %entry
@x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4
; PR16551
define void @test5(i32 %X) nounwind {
define void @test5(i32 %X) nounwind minsize {
entry:
%bf.load = load i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4
%bf.lshr = lshr i56 %bf.load, 32