From 6c327f92a562d9d280bdbc3bde3c0ce269a4c65c Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Thu, 17 Jan 2013 09:59:53 +0000
Subject: [PATCH] Optimization for the following SIGN_EXTEND pairs: v8i8 ->
 v8i64, v8i8 -> v8i32, v4i8 -> v4i64, v4i16 -> v4i64 for AVX and AVX2.
 Bug 14865.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172708 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp  | 18 ++++--
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  4 +-
 lib/Target/X86/X86ISelLowering.cpp        | 27 ++++++++-
 test/CodeGen/X86/avx-sext.ll              | 68 +++++++++++++++++++++++
 test/CodeGen/X86/avx2-conversions.ll      | 12 ++++
 5 files changed, 119 insertions(+), 10 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a82410ae6a0..3e5a446e6e5 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4298,11 +4298,19 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   if (isa<ConstantSDNode>(N0))
     return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N0);
 
-  // fold (sext (sext x)) -> (sext x)
-  // fold (sext (aext x)) -> (sext x)
-  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
-    return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT,
-                       N0.getOperand(0));
+  // Folding (sext (sext x)) is obvious, but we do it only after the type
+  // legalization phase. When the extend sequence is like {(T1->T2), (T2->T3)}
+  // and T1 or T3 (or both) are illegal types, the TypeLegalizer may not
+  // produce a good sequence for the (T1->T3) pair.
+  // So we let the target-specific combiner optimize T1->T2 and T2->T3
+  // separately, and maybe fold each into a preceding or subsequent one.
+  if (Level >= AfterLegalizeTypes) {
+    // fold (sext (sext x)) -> (sext x)
+    // fold (sext (aext x)) -> (sext x)
+    if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+      return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT,
+                         N0.getOperand(0));
+  }
 
   if (N0.getOpcode() == ISD::TRUNCATE) {
     // fold (sext (truncate (load x))) -> (sext (smaller load x))
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 344d1447a8d..91491bfe802 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2554,9 +2554,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
             VT.getVectorNumElements() ==
             Operand.getValueType().getVectorNumElements()) &&
            "Vector element count mismatch!");
-    if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
-      return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
-    else if (OpOpcode == ISD::UNDEF)
+    if (OpOpcode == ISD::UNDEF)
       // sext(undef) = 0, because the top bits will all be the same.
      return getConstant(0, VT);
     break;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f42884dd2e8..a8294b6de98 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16970,14 +16970,37 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  EVT InVT = In.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+  unsigned ExtendedEltSize = VT.getVectorElementType().getSizeInBits();
+
+  // Split the SIGN_EXTEND so that each step can use a vpmovsx instruction.
+  if (InVT == MVT::v8i8) {
+    if (ExtendedEltSize > 16 && !Subtarget->hasInt256())
+      In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, In);
+    if (ExtendedEltSize > 32)
+      In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i32, In);
+    return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, In);
+  }
+
+  if ((InVT == MVT::v4i8 || InVT == MVT::v4i16) &&
+      ExtendedEltSize > 32 && !Subtarget->hasInt256()) {
+    In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
+    return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, In);
+  }
 
   if (!DCI.isBeforeLegalizeOps())
     return SDValue();
 
   if (!Subtarget->hasFp256())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
-  if (VT.isVector() && VT.getSizeInBits() == 256) {
+  if (VT.is256BitVector()) {
     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
     if (R.getNode())
       return R;
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
index 8d7d79db7de..5201575f120 100755
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -142,3 +142,71 @@ define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
   %Y = sext <8 x i8> %X to <8 x i16>
   ret <8 x i16>%Y
 }
+; AVX: sext_1
+; AVX: vpmovsxbd
+; AVX: vpmovsxdq
+; AVX: vpmovsxdq
+; AVX: ret
+define void @sext_1(<4 x i8>* %inbuf, <4 x i64>* %outbuf) {
+  %v0 = load <4 x i8>* %inbuf
+  %r = sext <4 x i8> %v0 to <4 x i64>
+  store <4 x i64> %r, <4 x i64>* %outbuf
+  ret void
+}
+
+; AVX: sext_2
+; AVX: vpmovsxbd
+; AVX: ret
+define void @sext_2(<4 x i8>* %inbuf, <4 x i32>* %outbuf) {
+  %v0 = load <4 x i8>* %inbuf
+  %r = sext <4 x i8> %v0 to <4 x i32>
+  store <4 x i32> %r, <4 x i32>* %outbuf
+  ret void
+}
+
+; AVX: sext_3
+; AVX: vpmovsxwd
+; AVX: ret
+define void @sext_3(<4 x i16>* %inbuf, <4 x i32>* %outbuf) {
+  %v0 = load <4 x i16>* %inbuf
+  %r = sext <4 x i16> %v0 to <4 x i32>
+  store <4 x i32> %r, <4 x i32>* %outbuf
+  ret void
+}
+
+; AVX: sext_4
+; AVX: vpmovsxwd
+; AVX: vpmovsxdq
+; AVX: vpmovsxdq
+; AVX: ret
+define void @sext_4(<4 x i16>* %inbuf, <4 x i64>* %outbuf) {
+  %v0 = load <4 x i16>* %inbuf
+  %r = sext <4 x i16> %v0 to <4 x i64>
+  store <4 x i64> %r, <4 x i64>* %outbuf
+  ret void
+}
+
+; AVX: sext_5
+; AVX: vpmovsxbw
+; AVX: vpmovsxwd
+; AVX: vpmovsxwd
+; AVX: vpmovsxdq
+; AVX: ret
+define void @sext_5(<8 x i8>* %inbuf, <8 x i64>* %outbuf) {
+  %v0 = load <8 x i8>* %inbuf
+  %r = sext <8 x i8> %v0 to <8 x i64>
+  store <8 x i64> %r, <8 x i64>* %outbuf
+  ret void
+}
+; AVX: sext_6
+; AVX: vpmovsxbw
+; AVX: vpmovsxwd
+; AVX: vpmovsxwd
+; AVX: ret
+define void @sext_6(<8 x i8>* %inbuf, <8 x i32>* %outbuf) {
+  %v0 = load <8 x i8>* %inbuf
+  %r = sext <8 x i8> %v0 to <8 x i32>
+  store <8 x i32> %r, <8 x i32>* %outbuf
+  ret void
+}
+
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index 3ce08dcc737..17bd10a76e7 100755
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -107,3 +107,15 @@ define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) {
   %Y = sext <8 x i8> %X to <8 x i32>
   ret <8 x i32>%Y
 }
+
+; CHECK: load_sext_test6
+; CHECK: vpmovsxbd (%r{{[^,]*}}), %ymm{{.*}}
+; CHECK: vpmovsxdq
+; CHECK: vpmovsxdq
+; CHECK: ret
+define <8 x i64> @load_sext_test6(<8 x i8> *%ptr) {
+  %X = load <8 x i8>* %ptr
+  %Y = sext <8 x i8> %X to <8 x i64>
+  ret <8 x i64>%Y
+}
+
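
For readers tracing the combine, here is a minimal standalone sketch of the step-splitting decision that the PerformSExtCombine hunk above implements. It is plain C++, not the LLVM API: `VecTy` and `splitSExt` are hypothetical names, vector types are modeled as (element bits, element count) pairs, and the chain of intermediate types is returned instead of building SDNodes.

// Standalone model (hypothetical, not LLVM code) of the sext splitting in
// PerformSExtCombine. Each hop in the returned chain corresponds to one
// vpmovsx instruction.
#include <cstdio>
#include <vector>

struct VecTy { unsigned EltBits, NumElts; };

std::vector<VecTy> splitSExt(VecTy From, VecTy To, bool HasInt256) {
  std::vector<VecTy> Chain{From};
  if (From.NumElts == 8 && From.EltBits == 8) {          // v8i8 source
    if (To.EltBits > 16 && !HasInt256)
      Chain.push_back({16, 8});                          // via v8i16
    if (To.EltBits > 32)
      Chain.push_back({32, 8});                          // via v8i32
  } else if (From.NumElts == 4 && From.EltBits <= 16 &&  // v4i8 / v4i16 source
             To.EltBits > 32 && !HasInt256) {
    Chain.push_back({32, 4});                            // via v4i32
  }
  Chain.push_back(To);
  return Chain;
}

int main() {
  // v8i8 -> v8i64 on AVX without Int256: prints v8i8 v8i16 v8i32 v8i64,
  // matching the vpmovsxbw/vpmovsxwd/vpmovsxdq sequence checked in sext_5.
  for (VecTy T : splitSExt({8, 8}, {64, 8}, /*HasInt256=*/false))
    std::printf("v%ui%u ", T.NumElts, T.EltBits);
  std::printf("\n");
}

Compiled with any C++11 compiler, the printed chain should be exactly the sequence of vector types the combine materializes; with HasInt256 true, the v8i16 hop disappears, mirroring the AVX2 load_sext_test6 case above.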