From e1af5f6ad178e76429b58759042f061247d90435 Mon Sep 17 00:00:00 2001
From: Jim Grosbach
Date: Fri, 22 Nov 2013 19:57:47 +0000
Subject: [PATCH] X86: Perform integer comparisons at i32 or larger.

Utilizing the 8 and 16 bit comparison instructions, even when an input can
be folded into the comparison instruction itself, is typically not worth it.
There are too many partial register stalls as a result, leading to
significant slowdowns. By always performing comparisons on at least 32-bit
registers, performance of the calculation chain leading to the comparison
improves. Continue to use the smaller comparisons when minimizing size, as
that allows better folding of loads into the comparison instructions.

rdar://15386341

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@195496 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp          | 29 +++++++
 test/CodeGen/X86/2007-10-17-IllegalAsm.ll   | 87 ---------------------
 test/CodeGen/X86/3addr-16bit.ll             |  6 +-
 test/CodeGen/X86/codegen-prepare-extload.ll |  2 +-
 test/CodeGen/X86/ctpop-combine.ll           |  2 +-
 test/CodeGen/X86/memcmp.ll                  | 22 +++---
 test/CodeGen/X86/shrink-compare.ll          |  8 +-
 7 files changed, 50 insertions(+), 106 deletions(-)
 delete mode 100644 test/CodeGen/X86/2007-10-17-IllegalAsm.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6c8865915da..df2785c0675 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3419,6 +3419,24 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
   }
 }
 
+/// \brief Return true if the condition is an unsigned comparison operation.
+static bool isX86CCUnsigned(unsigned X86CC) {
+  switch (X86CC) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case X86::COND_E:  return true;
+  case X86::COND_G:  return false;
+  case X86::COND_GE: return false;
+  case X86::COND_L:  return false;
+  case X86::COND_LE: return false;
+  case X86::COND_NE: return true;
+  case X86::COND_B:  return true;
+  case X86::COND_A:  return true;
+  case X86::COND_BE: return true;
+  case X86::COND_AE: return true;
+  }
+  llvm_unreachable("covered switch fell through?!");
+}
+
 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
 /// specific condition code, returning the condition code and the LHS/RHS of the
 /// comparison to make.
@@ -9662,6 +9680,17 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
   SDLoc dl(Op0);
   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+    // Do the comparison at i32 if it's smaller. This avoids subregister
+    // aliasing issues. Keep the smaller reference if we're optimizing for
+    // size, however, as that'll allow better folding of memory operations.
+    if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
+        !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+             AttributeSet::FunctionIndex, Attribute::MinSize)) {
+      unsigned ExtendOp =
+          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
+      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+    }
     // Use SUB instead of CMP to enable CSE between SUB and CMP.
     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
diff --git a/test/CodeGen/X86/2007-10-17-IllegalAsm.ll b/test/CodeGen/X86/2007-10-17-IllegalAsm.ll
deleted file mode 100644
index c0bb55ed14e..00000000000
--- a/test/CodeGen/X86/2007-10-17-IllegalAsm.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | grep addb | not grep x
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | grep cmpb | not grep x
-; PR1734
-
-target triple = "x86_64-unknown-linux-gnu"
- %struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
- %struct.eh_status = type opaque
- %struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack*, i32, %struct.location_t, i32, i8*, %struct.rtx_def** }
- %struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
- %struct.function = type { %struct.eh_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.CUMULATIVE_ARGS, %struct.rtx_def*, %struct.rtx_def*, %struct.initial_value_struct*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i8, i32, i64, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.varray_head_tag*, %struct.temp_slot*, i32, %struct.var_refs_queue*, i32, i32, %struct.rtvec_def*, %struct.tree_node*, i32, i32, i32, %struct.machine_function*, i32, i32, i8, i8, %struct.language_function*, %struct.rtx_def*, i32, i32, i32, i32, %struct.location_t, %struct.varray_head_tag*, %struct.tree_node*, %struct.tree_node*, i8, i8, i8 }
- %struct.initial_value_struct = type opaque
- %struct.lang_decl = type opaque
- %struct.language_function = type opaque
- %struct.location_t = type { i8*, i32 }
- %struct.machine_function = type { %struct.stack_local_entry*, i8*, %struct.rtx_def*, i32, i32, i32, i32, i32 }
- %struct.rtunion = type { i8* }
- %struct.rtvec_def = type { i32, [1 x %struct.rtx_def*] }
- %struct.rtx_def = type { i16, i8, i8, %struct.u }
- %struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack* }
- %struct.stack_local_entry = type opaque
- %struct.temp_slot = type opaque
- %struct.tree_common = type { %struct.tree_node*, %struct.tree_node*, %union.tree_ann_d*, i8, i8, i8, i8, i8 }
- %struct.tree_decl = type { %struct.tree_common, %struct.location_t, i32, %struct.tree_node*, i8, i8, i8, i8, i8, i8, i8, i8, i32, %struct.tree_decl_u1, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, i32, %struct.tree_decl_u2, %struct.tree_node*, %struct.tree_node*, i64, %struct.lang_decl* }
- %struct.tree_decl_u1 = type { i64 }
- %struct.tree_decl_u2 = type { %struct.function* }
- %struct.tree_node = type { %struct.tree_decl }
- %struct.u = type { [1 x %struct.rtunion] }
- %struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* }
- %struct.varasm_status = type opaque
- %struct.varray_data = type { [1 x i64] }
- %struct.varray_head_tag = type { i64, i64, i32, i8*, %struct.varray_data }
- %union.tree_ann_d = type opaque
-
-define void @layout_type(%struct.tree_node* %type) {
-entry:
- %tmp32 = load i32* null, align 8		; [#uses=3]
- %tmp3435 = trunc i32 %tmp32 to i8		; [#uses=1]
- %tmp53 = icmp eq %struct.tree_node* null, null		; [#uses=1]
- br i1 %tmp53, label %cond_next57, label %UnifiedReturnBlock
-
-cond_next57:		; preds = %entry
- %tmp65 = and i32 %tmp32, 255		; [#uses=1]
- switch i32 %tmp65, label %UnifiedReturnBlock [
- 	 i32 6, label %bb140
- 	 i32 7, label %bb140
- 	 i32 8, label %bb140
- 	 i32 13, label %bb478
- ]
-
-bb140:		; preds = %cond_next57, %cond_next57, %cond_next57
- %tmp219 = load i32* null, align 8		; [#uses=1]
- %tmp221222 = trunc i32 %tmp219 to i8		; [#uses=1]
- %tmp223 = icmp eq i8 %tmp221222, 24		; [#uses=1]
- br i1 %tmp223, label %cond_true226, label %cond_next340
-
-cond_true226:		; preds = %bb140
- switch i8 %tmp3435, label %cond_true288 [
- 	 i8 6, label %cond_next340
- 	 i8 9, label %cond_next340
- 	 i8 7, label %cond_next340
- 	 i8 8, label %cond_next340
- 	 i8 10, label %cond_next340
- ]
-
-cond_true288:		; preds = %cond_true226
- unreachable
-
-cond_next340:		; preds = %cond_true226, %cond_true226, %cond_true226, %cond_true226, %cond_true226, %bb140
- ret void
-
-bb478:		; preds = %cond_next57
- br i1 false, label %cond_next500, label %cond_true497
-
-cond_true497:		; preds = %bb478
- unreachable
-
-cond_next500:		; preds = %bb478
- %tmp513 = load i32* null, align 8		; [#uses=1]
- %tmp545 = and i32 %tmp513, 8192		; [#uses=1]
- %tmp547 = and i32 %tmp32, -8193		; [#uses=1]
- %tmp548 = or i32 %tmp547, %tmp545		; [#uses=1]
- store i32 %tmp548, i32* null, align 8
- ret void
-
-UnifiedReturnBlock:		; preds = %cond_next57, %entry
- ret void
-}
diff --git a/test/CodeGen/X86/3addr-16bit.ll b/test/CodeGen/X86/3addr-16bit.ll
index fafdfdb7481..2d6a5e76657 100644
--- a/test/CodeGen/X86/3addr-16bit.ll
+++ b/test/CodeGen/X86/3addr-16bit.ll
@@ -34,7 +34,7 @@ entry:
 
 ; 64BIT-LABEL: t2:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT: decl %eax
+; 64BIT: leal -1(%rsi), %eax
 ; 64BIT: movzwl %ax
   %0 = icmp eq i16 %k, %c		; [#uses=1]
   %1 = add i16 %k, -1		; [#uses=3]
@@ -59,7 +59,7 @@ entry:
 
 ; 64BIT-LABEL: t3:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT: addl $2, %eax
+; 64BIT: leal 2(%rsi), %eax
   %0 = add i16 %k, 2		; [#uses=3]
   %1 = icmp eq i16 %k, %c		; [#uses=1]
   br i1 %1, label %bb, label %bb1
@@ -82,7 +82,7 @@ entry:
 
 ; 64BIT-LABEL: t4:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT: addl %edi, %eax
+; 64BIT: leal (%rsi,%rdi), %eax
   %0 = add i16 %k, %c		; [#uses=3]
   %1 = icmp eq i16 %k, %c		; [#uses=1]
   br i1 %1, label %bb, label %bb1
diff --git a/test/CodeGen/X86/codegen-prepare-extload.ll b/test/CodeGen/X86/codegen-prepare-extload.ll
index 14df815663e..9320706d972 100644
--- a/test/CodeGen/X86/codegen-prepare-extload.ll
+++ b/test/CodeGen/X86/codegen-prepare-extload.ll
@@ -5,7 +5,7 @@
 ; CodeGenPrepare should move the zext into the block with the load
 ; so that SelectionDAG can select it with the load.
 
-; CHECK: movzbl ({{%rdi|%rcx}}), %eax
+; CHECK: movsbl ({{%rdi|%rcx}}), %eax
 
 define void @foo(i8* %p, i32* %q) {
 entry:
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 786f7f9b1cc..463505bd95d 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -35,6 +35,6 @@ define i32 @test3(i64 %x) nounwind readnone {
   %conv = zext i1 %cmp to i32
   ret i32 %conv
 ; CHECK-LABEL: test3:
-; CHECK: cmpb $2
+; CHECK: cmpl $2
 ; CHECK: ret
 }
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index cb0797d3eb3..0a534926c6c 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -22,8 +22,9 @@ bb:		; preds = %entry
 return:		; preds = %entry
   ret void
 ; CHECK-LABEL: memcmp2:
-; CHECK: movw ([[A0:%rdi|%rcx]]), %ax
-; CHECK: cmpw ([[A1:%rsi|%rdx]]), %ax
+; CHECK: movzwl
+; CHECK-NEXT: movzwl
+; CHECK-NEXT: cmpl
 ; NOBUILTIN-LABEL: memcmp2:
 ; NOBUILTIN: callq
 }
@@ -41,7 +42,8 @@ bb:		; preds = %entry
 return:		; preds = %entry
   ret void
 ; CHECK-LABEL: memcmp2a:
-; CHECK: cmpw $28527, ([[A0]])
+; CHECK: movzwl
+; CHECK-NEXT: cmpl $28527,
 }
 
 
@@ -58,8 +60,8 @@ bb:		; preds = %entry
 return:		; preds = %entry
   ret void
 ; CHECK-LABEL: memcmp4:
-; CHECK: movl ([[A0]]), %eax
-; CHECK: cmpl ([[A1]]), %eax
+; CHECK: movl
+; CHECK-NEXT: cmpl
 }
 
 define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind {
@@ -75,7 +77,7 @@ bb:		; preds = %entry
 return:		; preds = %entry
   ret void
 ; CHECK-LABEL: memcmp4a:
-; CHECK: cmpl $1869573999, ([[A0]])
+; CHECK: cmpl $1869573999,
 }
 
 define void @memcmp8(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
@@ -91,8 +93,8 @@ bb:		; preds = %entry
 return:		; preds = %entry
   ret void
 ; CHECK-LABEL: memcmp8:
-; CHECK: movq ([[A0]]), %rax
-; CHECK: cmpq ([[A1]]), %rax
+; CHECK: movq
+; CHECK: cmpq
 }
 
 define void @memcmp8a(i8* %X, i32* nocapture %P) nounwind {
@@ -108,7 +110,7 @@ bb:		; preds = %entry
 return:		; preds = %entry
   ret void
 ; CHECK-LABEL: memcmp8a:
-; CHECK: movabsq $8029759185026510694, %rax
-; CHECK: cmpq %rax, ([[A0]])
+; CHECK: movabsq $8029759185026510694,
+; CHECK: cmpq
 }
 
diff --git a/test/CodeGen/X86/shrink-compare.ll b/test/CodeGen/X86/shrink-compare.ll
index bb892011e2d..fc7ee061f35 100644
--- a/test/CodeGen/X86/shrink-compare.ll
+++ b/test/CodeGen/X86/shrink-compare.ll
@@ -2,7 +2,7 @@
 
 declare void @bar()
 
-define void @test1(i32* nocapture %X) nounwind {
+define void @test1(i32* nocapture %X) nounwind minsize {
 entry:
   %tmp1 = load i32* %X, align 4
   %and = and i32 %tmp1, 255
@@ -19,7 +19,7 @@ if.end:
 ; CHECK: cmpb $47, (%{{rdi|rcx}})
 }
 
-define void @test2(i32 %X) nounwind {
+define void @test2(i32 %X) nounwind minsize {
 entry:
   %and = and i32 %X, 255
   %cmp = icmp eq i32 %and, 47
@@ -35,7 +35,7 @@ if.end:
 ; CHECK: cmpb $47, %{{dil|cl}}
 }
 
-define void @test3(i32 %X) nounwind {
+define void @test3(i32 %X) nounwind minsize {
 entry:
   %and = and i32 %X, 255
   %cmp = icmp eq i32 %and, 255
@@ -70,7 +70,7 @@ lor.end:		; preds = %lor.rhs, %entry
 @x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4
 
 ; PR16551
-define void @test5(i32 %X) nounwind {
+define void @test5(i32 %X) nounwind minsize {
 entry:
   %bf.load = load i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4
   %bf.lshr = lshr i56 %bf.load, 32
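
The extension choice in EmitCmp above is the core of the change: an unsigned condition code (COND_B, COND_A, COND_BE, COND_AE, and the equality conditions) stays correct when both operands are zero-extended, while a signed one (COND_G, COND_GE, COND_L, COND_LE) needs sign-extension. The following standalone C++ sketch is not part of the patch and its helper names (unsignedLess8, signedLess8) are illustrative only; it exhaustively checks, for 8-bit operands, that widening with the matching extension preserves the comparison result, which is what allows the compare itself to run on a 32-bit register.

#include <cassert>
#include <cstdint>

// Unsigned 8-bit compare stays correct when both operands are zero-extended,
// mirroring the ISD::ZERO_EXTEND path taken for unsigned condition codes.
static bool unsignedLess8(uint8_t a, uint8_t b) {
  uint32_t wa = a, wb = b; // zero-extend to 32 bits
  return wa < wb;          // same result as the 8-bit unsigned compare
}

// Signed 8-bit compare stays correct when both operands are sign-extended,
// mirroring the ISD::SIGN_EXTEND path taken for signed condition codes.
static bool signedLess8(int8_t a, int8_t b) {
  int32_t wa = a, wb = b;  // sign-extend to 32 bits
  return wa < wb;          // same result as the 8-bit signed compare
}

int main() {
  // Exhaustively check every pair of 8-bit operands.
  for (int x = 0; x < 256; ++x)
    for (int y = 0; y < 256; ++y) {
      assert(unsignedLess8((uint8_t)x, (uint8_t)y) == ((uint8_t)x < (uint8_t)y));
      assert(signedLess8((int8_t)x, (int8_t)y) == ((int8_t)x < (int8_t)y));
    }
  return 0;
}

When optimizing for size (the minsize attribute added to the shrink-compare.ll functions), the narrow cmpb/cmpw forms are still preferred because they can fold a memory operand directly, which is why those tests keep their 8-bit CHECK lines.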