From 0e536ee4cae5f7359e8a8db99edadc39a5c12132 Mon Sep 17 00:00:00 2001 From: Jim Grosbach Date: Thu, 31 Oct 2013 00:20:48 +0000 Subject: [PATCH] Legalize: Improve legalization of long vector extends. When an extend more than doubles the size of the elements (e.g., a zext from v16i8 to v16i32), the normal legalization method of splitting the vectors will run into problems as by the time the destination vector is legal, the source vector is illegal. The end result is the operation often becoming scalarized, with the typical horrible performance. For example, on x86_64, the simple input of: define void @bar(<16 x i8> %a, <16 x i32>* %p) nounwind { %tmp = zext <16 x i8> %a to <16 x i32> store <16 x i32> %tmp, <16 x i32>*%p ret void } Generates: .section __TEXT,__text,regular,pure_instructions .section __TEXT,__const .align 5 LCPI0_0: .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .long 255 ## 0xff .section __TEXT,__text,regular,pure_instructions .globl _bar .align 4, 0x90 _bar: vpunpckhbw %xmm0, %xmm0, %xmm1 vpunpckhwd %xmm0, %xmm1, %xmm2 vpmovzxwd %xmm1, %xmm1 vinsertf128 $1, %xmm2, %ymm1, %ymm1 vmovaps LCPI0_0(%rip), %ymm2 vandps %ymm2, %ymm1, %ymm1 vpmovzxbw %xmm0, %xmm3 vpunpckhwd %xmm0, %xmm3, %xmm3 vpmovzxbd %xmm0, %xmm0 vinsertf128 $1, %xmm3, %ymm0, %ymm0 vandps %ymm2, %ymm0, %ymm0 vmovaps %ymm0, (%rdi) vmovaps %ymm1, 32(%rdi) vzeroupper ret So instead we can check if there are legal types that enable us to split more cleverly when the input vector is already legal such that we don't turn it into an illegal type. If the extend is such that it's more than doubling the size of the input we check if - the number of vector elements is even, - the source type is legal, - the type of a split source is illegal, - the type of an extended (by doubling element size) source is legal, and - the type of that extended source when split is legal. If the conditions are met, instead of just splitting both the destination and the source types, we create an extend that only goes up one "step" (doubling the element width), and the continue legalizing the rest of the operation normally. The result is that this operates as a new, more effecient, termination condition for the loop of "split the operation until the destination type is legal." With this change, the above example now compiles to: _bar: vpxor %xmm1, %xmm1, %xmm1 vpunpcklbw %xmm1, %xmm0, %xmm2 vpunpckhwd %xmm1, %xmm2, %xmm3 vpunpcklwd %xmm1, %xmm2, %xmm2 vinsertf128 $1, %xmm3, %ymm2, %ymm2 vpunpckhbw %xmm1, %xmm0, %xmm0 vpunpckhwd %xmm1, %xmm0, %xmm3 vpunpcklwd %xmm1, %xmm0, %xmm0 vinsertf128 $1, %xmm3, %ymm0, %ymm0 vmovaps %ymm0, 32(%rdi) vmovaps %ymm2, (%rdi) vzeroupper ret This generalizes a custom lowering that was added a while back to the ARM backend. That lowering is no longer necessary, and is removed. The testcases for it, however, provide excellent ARM tests for this change and so remain. rdar://14735100 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@193727 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 65 ++++++++++++++++++- lib/Target/ARM/ARMISelLowering.cpp | 55 ---------------- test/CodeGen/X86/long-extend.ll | 18 +++++ 4 files changed, 81 insertions(+), 58 deletions(-) create mode 100644 test/CodeGen/X86/long-extend.ll diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 5085380cbb6..05900d006f0 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -561,6 +561,7 @@ private: void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index ad31f7e94fd..a4e2fc472ed 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -521,7 +521,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_VECTOR_SHUFFLE(cast(N), Lo, Hi); break; - case ISD::ANY_EXTEND: case ISD::CONVERT_RNDSAT: case ISD::CTLZ: case ISD::CTTZ: @@ -548,14 +547,18 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: - case ISD::SIGN_EXTEND: case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: - case ISD::ZERO_EXTEND: SplitVecRes_UnaryOp(N, Lo, Hi); break; + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + SplitVecRes_ExtendOp(N, Lo, Hi); + break; + case ISD::ADD: case ISD::SUB: case ISD::MUL: @@ -921,6 +924,62 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, } } +void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc dl(N); + EVT SrcVT = N->getOperand(0).getValueType(); + EVT DestVT = N->getValueType(0); + EVT LoVT, HiVT; + GetSplitDestVTs(DestVT, LoVT, HiVT); + + // We can do better than a generic split operation if the extend is doing + // more than just doubling the width of the elements and the following are + // true: + // - The number of vector elements is even, + // - the source type is legal, + // - the type of a split source is illegal, + // - the type of an extended (by doubling element size) source is legal, and + // - the type of that extended source when split is legal. + // + // This won't necessarily completely legalize the operation, but it will + // more effectively move in the right direction and prevent falling down + // to scalarization in many cases due to the input vector being split too + // far. + unsigned NumElements = SrcVT.getVectorNumElements(); + if ((NumElements & 1) == 0 && + SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) { + LLVMContext &Ctx = *DAG.getContext(); + EVT NewSrcVT = EVT::getVectorVT( + Ctx, EVT::getIntegerVT( + Ctx, SrcVT.getVectorElementType().getSizeInBits() * 2), + NumElements); + EVT SplitSrcVT = + EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2); + EVT SplitLoVT, SplitHiVT; + GetSplitDestVTs(NewSrcVT, SplitLoVT, SplitHiVT); + if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) && + TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) { + DEBUG(dbgs() << "Split vector extend via incremental extend:"; + N->dump(&DAG); dbgs() << "\n"); + // Extend the source vector by one step. + SDValue NewSrc = + DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0)); + // Get the low and high halves of the new, extended one step, vector. + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitLoVT, NewSrc, + DAG.getConstant(0, TLI.getVectorIdxTy())); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitHiVT, NewSrc, + DAG.getConstant(SplitLoVT.getVectorNumElements(), + TLI.getVectorIdxTy())); + // Extend those vector halves the rest of the way. + Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); + Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); + return; + } + } + // Fall back to the generic unary operator splitting otherwise. + SplitVecRes_UnaryOp(N, Lo, Hi); +} + void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi) { // The low and high parts of the original input give four input vectors. diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 5296b3b848d..2de2dfa0ca7 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -567,16 +567,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); - // Custom expand long extensions to vectors. - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - // NEON does not have single instruction CTPOP for vectors with element // types wider than 8-bits. However, custom lowering can leverage the // v8i8/v16i8 vcnt instruction. @@ -3830,47 +3820,6 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } -/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec), -/// and size(DestVec) > 128-bits. -/// This is achieved by doing the one extension from the SrcVec, splitting the -/// result, extending these parts, and then concatenating these into the -/// destination. -static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) { - SDValue Op = N->getOperand(0); - EVT SrcVT = Op.getValueType(); - EVT DestVT = N->getValueType(0); - - assert(DestVT.getSizeInBits() > 128 && - "Custom sext/zext expansion needs >128-bit vector."); - // If this is a normal length extension, use the default expansion. - if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() && - SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits()) - return SDValue(); - - SDLoc dl(N); - unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); - unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits(); - unsigned NumElts = SrcVT.getVectorNumElements(); - LLVMContext &Ctx = *DAG.getContext(); - SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi; - - EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2), - NumElts); - EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2), - NumElts/2); - EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize), - NumElts/2); - - Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op); - SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid, - DAG.getIntPtrConstant(0)); - SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid, - DAG.getIntPtrConstant(NumElts/2)); - ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo); - ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi); -} - /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -6149,10 +6098,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::BITCAST: Res = ExpandBITCAST(N, DAG); break; - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: - Res = ExpandVectorExtension(N, DAG); - break; case ISD::SRL: case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); diff --git a/test/CodeGen/X86/long-extend.ll b/test/CodeGen/X86/long-extend.ll new file mode 100644 index 00000000000..5bbd41dad9d --- /dev/null +++ b/test/CodeGen/X86/long-extend.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0| FileCheck %s +define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind { +; CHECK-LABEL: test_long_extend +; CHECK: vpunpcklbw %xmm1, %xmm0, [[REG1:%xmm[0-9]+]] +; CHECK: vpunpckhwd %xmm1, [[REG1]], [[REG2:%xmm[0-9]+]] +; CHECK: vpunpcklwd %xmm1, [[REG1]], %x[[REG3:mm[0-9]+]] +; CHECK: vinsertf128 $1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]] +; CHECK: vpunpckhbw %xmm1, %xmm0, [[REG4:%xmm[0-9]+]] +; CHECK: vpunpckhwd %xmm1, [[REG4]], [[REG5:%xmm[0-9]+]] +; CHECK: vpunpcklwd %xmm1, [[REG4]], %x[[REG6:mm[0-9]+]] +; CHECK: vinsertf128 $1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]] +; CHECK: vmovaps [[REG_result1]], 32(%rdi) +; CHECK: vmovaps [[REG_result0]], (%rdi) + + %tmp = zext <16 x i8> %a to <16 x i32> + store <16 x i32> %tmp, <16 x i32>*%p + ret void +}