From 52981c4b6016d9f0e295e0771ec0a50dd073b4b3 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Wed, 20 Feb 2013 12:42:54 +0000
Subject: [PATCH] I optimized the following patterns:
 sext <4 x i1> to <4 x i64>
 sext <4 x i8> to <4 x i64>
 sext <4 x i16> to <4 x i64>

I run a DAG combine on SIGN_EXTEND_INREG and rewrite the SEXT patterns:
  (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))

The sext_in_reg (v4i32 x) may be lowered to shl+sar operations. There is no
"sar" on a vector with 64-bit elements, so sext_in_reg (v4i64 x) has no
vector lowering.

I also added the costs of these operations to the AVX cost table.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@175619 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        | 37 +++++++++++++++++++++++
 lib/Target/X86/X86TargetTransformInfo.cpp |  3 ++
 test/Analysis/CostModel/X86/cast.ll       | 12 +++++++-
 test/CodeGen/X86/avx-sext.ll              | 23 ++++++++++++++
 4 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ed03cd1ee7..a2271663eec 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1323,6 +1323,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
@@ -17076,6 +17077,41 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+                                               const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+  DebugLoc dl = N->getDebugLoc();
+
+  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
+  // SSE and AVX2, since there is no sign-extending shift-right
+  // operation on a vector with 64-bit elements.
+  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
+  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
+  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
+    SDValue N00 = N0.getOperand(0);
+
+    // EXTLOAD has a better solution on AVX2:
+    // it may be replaced with an X86ISD::VSEXT node.
+    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+      if (!ISD::isNormalLoad(N00.getNode()))
+        return SDValue();
+
+    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+                                N00, N1);
+      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+    }
+  }
+  return SDValue();
+}
+
 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
@@ -17468,6 +17504,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f3dfa0e413f..fefb479da97 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -232,6 +232,9 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
     { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1,  6 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1,  9 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1,  8 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,  8 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 },
     { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64, 3 },
   };
 
diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll
index cedc6825988..bacc7786917 100644
--- a/test/Analysis/CostModel/X86/cast.ll
+++ b/test/Analysis/CostModel/X86/cast.ll
@@ -44,6 +44,10 @@ define i32 @zext_sext(<8 x i1> %in) {
   %B = zext <8 x i16> undef to <8 x i32>
   ;CHECK: cost of 1 {{.*}} sext
   %C = sext <4 x i32> undef to <4 x i64>
+  ;CHECK: cost of 8 {{.*}} sext
+  %C1 = sext <4 x i8> undef to <4 x i64>
+  ;CHECK: cost of 8 {{.*}} sext
+  %C2 = sext <4 x i16> undef to <4 x i64>
 
   ;CHECK: cost of 1 {{.*}} zext
   %D = zext <4 x i32> undef to <4 x i64>
@@ -59,7 +63,7 @@ define i32 @zext_sext(<8 x i1> %in) {
   ret i32 undef
 }
 
-define i32 @masks(<8 x i1> %in) {
+define i32 @masks8(<8 x i1> %in) {
   ;CHECK: cost of 6 {{.*}} zext
   %Z = zext <8 x i1> %in to <8 x i32>
   ;CHECK: cost of 9 {{.*}} sext
@@ -67,3 +71,9 @@ define i32 @masks(<8 x i1> %in) {
   ret i32 undef
 }
 
+define i32 @masks4(<4 x i1> %in) {
+  ;CHECK: cost of 8 {{.*}} sext
+  %S = sext <4 x i1> %in to <4 x i64>
+  ret i32 undef
+}
+
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
index 8d7d79db7de..7ae0d36c080 100755
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -142,3 +142,26 @@ define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
 %Y = sext <8 x i8> %X to <8 x i16>
 ret <8 x i16>%Y
 }
+
+; AVX: sext_4i1_to_4i64
+; AVX: vpslld $31
+; AVX: vpsrad $31
+; AVX: vpmovsxdq
+; AVX: vpmovsxdq
+; AVX: ret
+define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
+  %extmask = sext <4 x i1> %mask to <4 x i64>
+  ret <4 x i64> %extmask
+}
+
+; AVX: sext_4i8_to_4i64
+; AVX: vpslld $24
+; AVX: vpsrad $24
+; AVX: vpmovsxdq
+; AVX: vpmovsxdq
+; AVX: ret
+define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
+  %extmask = sext <4 x i8> %mask to <4 x i64>
+  ret <4 x i64> %extmask
+}
+
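
Reviewer note (not part of the patch): the commit message and the new cost-table
entries also cover sext <4 x i16> to <4 x i64>, which the added avx-sext.ll tests
do not exercise. A minimal sketch of such a test, assuming the combine produces
the same 32-bit shl/sra pair (by 16 here) followed by two vpmovsxdq, as in the
i1/i8 cases; the function name and CHECK lines are illustrative, not verified
compiler output:

; AVX: sext_4i16_to_4i64
; AVX: vpslld $16
; AVX: vpsrad $16
; AVX: vpmovsxdq
; AVX: vpmovsxdq
; AVX: ret
define <4 x i64> @sext_4i16_to_4i64(<4 x i16> %mask) {
  %extmask = sext <4 x i16> %mask to <4 x i64>
  ret <4 x i64> %extmask
}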