From 8b944d39b356135676459152385f05c496951f6c Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Thu, 28 May 2009 00:35:15 +0000
Subject: [PATCH] Added an optimization that narrows load / op / store
 sequences where 'op' is a bit-twiddling instruction whose second operand is
 an immediate.

If the bits touched by 'op' can be handled by a narrower instruction,
reduce the width of the load and store as well. This happens a lot with
bitfield manipulation code, e.g.

  orl $65536, 8(%rax)   =>   orb $1, 10(%rax)

Since narrowing is not always a win (e.g. i32 -> i16 is a loss on x86),
the dag combiner consults the target before performing the optimization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@72507 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h     |  7 ++
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 94 +++++++++++++++++++++++-
 lib/Target/X86/X86ISelLowering.cpp       |  5 ++
 lib/Target/X86/X86ISelLowering.h         |  5 ++
 test/CodeGen/X86/narrow_op-1.ll          | 23 ++++++
 test/CodeGen/X86/narrow_op-2.ll          | 23 ++++++
 6 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/X86/narrow_op-1.ll
 create mode 100644 test/CodeGen/X86/narrow_op-2.ll

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 0576e3e1a8b..dc66e55a1ff 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1420,6 +1420,13 @@ public:
     return false;
   }
 
+  /// isNarrowingProfitable - Return true if it's profitable to narrow
+  /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+  /// from i32 to i8 but not from i32 to i16.
+  virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const {
+    return true;
+  }
+
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a866cb5629e..6a47aa52a2a 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -41,6 +41,7 @@ using namespace llvm;
 STATISTIC(NodesCombined , "Number of dag nodes combined");
 STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
+STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
 
 namespace {
   static cl::opt<bool>
@@ -222,6 +223,7 @@
     SDValue BuildUDIV(SDNode *N);
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL);
     SDValue ReduceLoadWidth(SDNode *N);
+    SDValue ReduceLoadOpStoreWidth(SDNode *N);
 
     SDValue GetDemandedBits(SDValue V, const APInt &Mask);
 
@@ -4900,6 +4902,96 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
   return SDValue();
 }
 
+
+/// ReduceLoadOpStoreWidth - Look for sequences of load / op / store where op
+/// is one of 'or', 'xor', and 'and' of immediates. If 'op' only touches some
+/// of the loaded bits, try narrowing the load and store if that would be a
+/// win for performance or code size.
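+/// For example, (store (or (load p), 65536), p) on an i32 value only sets
+/// bit 16, so on a little-endian target it can become an i8 load, an 'or'
+/// with 1, and an i8 store at byte offset p+2.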
+SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  SDValue Chain = ST->getChain();
+  SDValue Value = ST->getValue();
+  SDValue Ptr = ST->getBasePtr();
+  MVT VT = Value.getValueType();
+
+  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+    return SDValue(0, 0);
+
+  unsigned Opc = Value.getOpcode();
+  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
+      Value.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue(0, 0);
+
+  SDValue N0 = Value.getOperand(0);
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
+    LoadSDNode *LD = cast<LoadSDNode>(N0);
+    if (LD->getBasePtr() != Ptr/* || Chain != N0.getValue(1)*/)
+      return SDValue(0, 0);
+
+    // Find the type to narrow the load / op / store to.
+    SDValue N1 = Value.getOperand(1);
+    unsigned BitWidth = N1.getValueSizeInBits();
+    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
+    if (Opc == ISD::AND)
+      Imm ^= APInt::getAllOnesValue(BitWidth);
+    unsigned ShAmt = Imm.countTrailingZeros();
+    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
+    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+    MVT NewVT = MVT::getIntegerVT(NewBW);
+    while (NewBW < BitWidth &&
+           !(TLI.isTypeLegal(NewVT) &&
+             TLI.isOperationLegalOrCustom(Opc, NewVT) &&
+             TLI.isNarrowingProfitable(VT, NewVT))) {
+      NewBW = NextPowerOf2(NewBW);
+      NewVT = MVT::getIntegerVT(NewBW);
+    }
+    if (NewBW == BitWidth)
+      return SDValue(0, 0);
+
+    // If the lowest changed bit does not start on a NewBW boundary, move
+    // down to the previous boundary.
+    if (ShAmt % NewBW)
+      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, ShAmt + NewBW);
+    if ((Imm & Mask) == Imm) {
+      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
+      if (Opc == ISD::AND)
+        NewImm ^= APInt::getAllOnesValue(NewBW);
+      uint64_t PtrOff = ShAmt / 8;
+      // For big-endian targets, adjust the pointer offset so that the
+      // correct bytes are loaded.
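+      // For example, an i32 narrowed to an i8 at bit offset 16 lives at byte
+      // offset 2 on a little-endian target but at byte offset 1 on a
+      // big-endian target.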
+      if (TLI.isBigEndian())
+        PtrOff = (BitWidth - NewBW) / 8 - PtrOff;
+
+      unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
+      SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
+                                   Ptr.getValueType(), Ptr,
+                                   DAG.getConstant(PtrOff, Ptr.getValueType()));
+      SDValue NewLD = DAG.getLoad(NewVT, N0.getDebugLoc(),
+                                  LD->getChain(), NewPtr,
+                                  LD->getSrcValue(), LD->getSrcValueOffset(),
+                                  LD->isVolatile(), NewAlign);
+      SDValue NewVal = DAG.getNode(Opc, Value.getDebugLoc(), NewVT, NewLD,
+                                   DAG.getConstant(NewImm, NewVT));
+      SDValue NewST = DAG.getStore(Chain, N->getDebugLoc(),
+                                   NewVal, NewPtr,
+                                   ST->getSrcValue(), ST->getSrcValueOffset(),
+                                   ST->isVolatile(), NewAlign);
+
+      AddToWorkList(NewPtr.getNode());
+      AddToWorkList(NewLD.getNode());
+      AddToWorkList(NewVal.getNode());
+      WorkListRemover DeadNodes(*this);
+      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1),
+                                    &DeadNodes);
+      ++OpsNarrowed;
+      return NewST;
+    }
+  }
+
+  return SDValue(0, 0);
+}
+
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -5086,7 +5178,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
                            ST->isVolatile(), ST->getAlignment());
   }
 
-  return SDValue();
+  return ReduceLoadOpStoreWidth(N);
 }
 
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b89eef0fb21..0136f90ec43 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6877,6 +6877,11 @@ bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
 }
 
+bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
+  // i16 instructions are longer (0x66 prefix) and potentially slower.
+  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
 /// isShuffleMaskLegal - Targets can use this to indicate that they only
 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index badbd2462fe..550f8bdf9b6 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -466,6 +466,11 @@ namespace llvm {
     virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
     virtual bool isZExtFree(MVT VT1, MVT VT2) const;
 
+    /// isNarrowingProfitable - Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const;
+
     /// isShuffleMaskLegal - Targets can use this to indicate that they only
     /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
diff --git a/test/CodeGen/X86/narrow_op-1.ll b/test/CodeGen/X86/narrow_op-1.ll
new file mode 100644
index 00000000000..0ee11b49558
--- /dev/null
+++ b/test/CodeGen/X86/narrow_op-1.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep orb | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 | grep orb | grep 1
+; RUN: llvm-as < %s | llc -march=x86-64 | grep orl | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 | grep orl | grep 16842752
+
+  %struct.bf = type { i64, i16, i16, i32 }
+@bfi = common global %struct.bf zeroinitializer, align 16
+
+define void @t1() nounwind optsize ssp {
+entry:
+  %0 = load i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
+  %1 = or i32 %0, 65536
+  store i32 %1, i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
+  ret void
+}
+
+define void @t2() nounwind optsize ssp {
+entry:
+  %0 = load i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
+  %1 = or i32 %0, 16842752
+  store i32 %1, i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
+  ret void
+}
diff --git a/test/CodeGen/X86/narrow_op-2.ll b/test/CodeGen/X86/narrow_op-2.ll
new file mode 100644
index 00000000000..b441794f42f
--- /dev/null
+++ b/test/CodeGen/X86/narrow_op-2.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | count 2
+; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | grep 254
+; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | grep 253
+
+  %struct.bf = type { i64, i16, i16, i32 }
+@bfi = external global %struct.bf*
+
+define void @t1() nounwind ssp {
+entry:
+  %0 = load %struct.bf** @bfi, align 8
+  %1 = getelementptr %struct.bf* %0, i64 0, i32 1
+  %2 = bitcast i16* %1 to i32*
+  %3 = load i32* %2, align 1
+  %4 = and i32 %3, -65537
+  store i32 %4, i32* %2, align 1
+  %5 = load %struct.bf** @bfi, align 8
+  %6 = getelementptr %struct.bf* %5, i64 0, i32 1
+  %7 = bitcast i16* %6 to i32*
+  %8 = load i32* %7, align 1
+  %9 = and i32 %8, -131073
+  store i32 %9, i32* %7, align 1
+  ret void
+}
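
As a rough, standalone sketch of the width and byte-offset arithmetic the new
combine performs for the 'or i32 %0, 65536' case in narrow_op-1.ll, written
with plain integers instead of APInt; it assumes the narrowing loop settles on
i8, which isNarrowingProfitable allows on x86-64:

// Sketch of ReduceLoadOpStoreWidth's arithmetic for `or i32 %x, 65536`,
// using plain integers instead of APInt.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Imm = 65536;                        // only bit 16 is set
  const unsigned BitWidth = 32;
  unsigned ShAmt = __builtin_ctz(Imm);               // 16: lowest touched bit
  unsigned MSB = BitWidth - __builtin_clz(Imm) - 1;  // 16: highest touched bit
  unsigned NewBW = 8;  // NextPowerOf2(MSB - ShAmt), widened until the type is
                       // legal and narrowing is profitable (i8 on x86-64)
  if (ShAmt % NewBW)                 // realign to a NewBW boundary if needed
    ShAmt -= ShAmt % NewBW;          // (16 is already a multiple of 8)
  uint32_t NewImm = (Imm >> ShAmt) & 0xFF;           // 1
  unsigned PtrOff = ShAmt / 8;                       // +2 bytes, little-endian
  std::printf("bits %u..%u -> i%u: orb $%u, %u(ptr)\n",
              ShAmt, MSB, NewBW, NewImm, PtrOff);
  return 0;
}

With the bitfield's base at struct offset 8, as in the commit message example,
the final address becomes 10(%rax).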