From 04b091a7822c60512285ee66b058f98399bf1cf9 Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Thu, 24 Dec 2009 01:07:17 +0000 Subject: [PATCH] handle equality memcmp of 8 bytes on x86-64 with two unaligned loads and a compare. On other targets we end up with a call to memcmp because we don't want 16 individual byte loads. We should be able to use movups as well, but we're failing to select the generated icmp. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@92107 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/SelectionDAGBuilder.cpp | 74 ++++++++++++++----- test/CodeGen/X86/memcmp.ll | 42 ++++++++++- 2 files changed, 93 insertions(+), 23 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e194003da93..8fe7c455450 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5092,17 +5092,8 @@ static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { return true; } -static SDValue getMemCmpLoad(Value *PtrVal, unsigned Size, +static SDValue getMemCmpLoad(Value *PtrVal, MVT LoadVT, const Type *LoadTy, SelectionDAGBuilder &Builder) { - MVT LoadVT; - const Type *LoadTy; - if (Size == 2) { - LoadVT = MVT::i16; - LoadTy = Type::getInt16Ty(PtrVal->getContext()); - } else { - LoadVT = MVT::i32; - LoadTy = Type::getInt32Ty(PtrVal->getContext()); - } // Check to see if this load can be trivially constant folded, e.g. if the // input is from a string literal. @@ -5158,16 +5149,61 @@ bool SelectionDAGBuilder::visitMemCmpCall(CallInst &I) { // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 - if (Size && (Size->getValue() == 2 || Size->getValue() == 4) && - IsOnlyUsedInZeroEqualityComparison(&I)) { - SDValue LHSVal = getMemCmpLoad(LHS, Size->getZExtValue(), *this); - SDValue RHSVal = getMemCmpLoad(RHS, Size->getZExtValue(), *this); + if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) { + bool ActuallyDoIt = true; + MVT LoadVT; + const Type *LoadTy; + switch (Size->getZExtValue()) { + default: + LoadVT = MVT::Other; + LoadTy = 0; + ActuallyDoIt = false; + break; + case 2: + LoadVT = MVT::i16; + LoadTy = Type::getInt16Ty(Size->getContext()); + break; + case 4: + LoadVT = MVT::i32; + LoadTy = Type::getInt32Ty(Size->getContext()); + break; + case 8: + LoadVT = MVT::i64; + LoadTy = Type::getInt64Ty(Size->getContext()); + break; + /* + case 16: + LoadVT = MVT::v4i32; + LoadTy = Type::getInt32Ty(Size->getContext()); + LoadTy = VectorType::get(LoadTy, 4); + break; + */ + } - SDValue Res = DAG.getSetCC(getCurDebugLoc(), MVT::i1, LHSVal, RHSVal, - ISD::SETNE); - EVT CallVT = TLI.getValueType(I.getType(), true); - setValue(&I, DAG.getZExtOrTrunc(Res, getCurDebugLoc(), CallVT)); - return true; + // This turns into unaligned loads. We only do this if the target natively + // supports the MVT we'll be loading or if it is small enough (<= 4) that + // we'll only produce a small number of byte loads. + + // Require that we can find a legal MVT, and only do this if the target + // supports unaligned loads of that type. Expanding into byte loads would + // bloat the code. + if (ActuallyDoIt && Size->getZExtValue() > 4) { + // TODO: Handle 5 byte compare as 4-byte + 1 byte. + // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads. + if (!TLI.isTypeLegal(LoadVT) ||!TLI.allowsUnalignedMemoryAccesses(LoadVT)) + ActuallyDoIt = false; + } + + if (ActuallyDoIt) { + SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this); + SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this); + + SDValue Res = DAG.getSetCC(getCurDebugLoc(), MVT::i1, LHSVal, RHSVal, + ISD::SETNE); + EVT CallVT = TLI.getValueType(I.getType(), true); + setValue(&I, DAG.getZExtOrTrunc(Res, getCurDebugLoc(), CallVT)); + return true; + } } diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index f6086c4fbe2..b90d2e21187 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -3,7 +3,7 @@ ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 -@.str = private constant [6 x i8] c"fooxx\00", align 1 ; <[6 x i8]*> [#uses=1] +@.str = private constant [23 x i8] c"fooooooooooooooooooooo\00", align 1 ; <[23 x i8]*> [#uses=1] declare i32 @memcmp(...) @@ -26,7 +26,7 @@ return: ; preds = %entry define void @memcmp2a(i8* %X, i32* nocapture %P) nounwind { entry: - %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 1), i32 2) nounwind ; [#uses=1] + %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8]* @.str, i32 0, i32 1), i32 2) nounwind ; [#uses=1] %1 = icmp eq i32 %0, 0 ; [#uses=1] br i1 %1, label %return, label %bb @@ -60,7 +60,7 @@ return: ; preds = %entry define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind { entry: - %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 1), i32 4) nounwind ; [#uses=1] + %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8]* @.str, i32 0, i32 1), i32 4) nounwind ; [#uses=1] %1 = icmp eq i32 %0, 0 ; [#uses=1] br i1 %1, label %return, label %bb @@ -71,6 +71,40 @@ bb: ; preds = %entry return: ; preds = %entry ret void ; CHECK: memcmp4a: -; CHECK: cmpl $2021158767, (%rdi) +; CHECK: cmpl $1869573999, (%rdi) +} + +define void @memcmp8(i8* %X, i8* %Y, i32* nocapture %P) nounwind { +entry: + %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 8) nounwind ; [#uses=1] + %1 = icmp eq i32 %0, 0 ; [#uses=1] + br i1 %1, label %return, label %bb + +bb: ; preds = %entry + store i32 4, i32* %P, align 4 + ret void + +return: ; preds = %entry + ret void +; CHECK: memcmp8: +; CHECK: movq (%rsi), %rax +; CHECK: cmpq %rax, (%rdi) +} + +define void @memcmp8a(i8* %X, i32* nocapture %P) nounwind { +entry: + %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8]* @.str, i32 0, i32 0), i32 8) nounwind ; [#uses=1] + %1 = icmp eq i32 %0, 0 ; [#uses=1] + br i1 %1, label %return, label %bb + +bb: ; preds = %entry + store i32 4, i32* %P, align 4 + ret void + +return: ; preds = %entry + ret void +; CHECK: memcmp8a: +; CHECK: movabsq $8029759185026510694, %rax +; CHECK: cmpq %rax, (%rdi) }