From 88873717824208d6d2f8f34bf727550fba1c590d Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Wed, 12 Feb 2014 23:42:28 +0000
Subject: [PATCH] [X86] Teach the backend how to lower vector shift left into
 multiply rather than scalarizing it.

Instead of expanding a packed shift into a sequence of scalar shifts,
the backend now tries (when possible) to convert the vector shift into a
vector multiply.

Before this change, a shift of a MVT::v8i16 vector by a build_vector of
constants was always scalarized into a long sequence of "vector extracts +
scalar shifts + vector inserts". With this change, if there is SSE2 support,
we emit a single vector multiply.

This change also affects SSE4.1, AVX, and AVX2 shifts:
- A shift of a MVT::v4i32 vector by a build_vector of non-uniform constants
  is now lowered, when possible, into a single SSE4.1 vector multiply.
- A packed v16i16 shift left by a constant build_vector is now expanded,
  when possible, into a single AVX2 vpmullw.
This change also improves the lowering of AVX512f vector shifts.

Added test CodeGen/X86/vec_shift6.ll with some code examples that are
affected by this change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@201271 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  33 +++++++
 test/CodeGen/X86/avx-shift.ll      |   4 +-
 test/CodeGen/X86/vec_shift6.ll     | 134 +++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/X86/vec_shift6.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 85656d80914..4ce2ea36c00 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13156,6 +13156,39 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
     return Op;
   }
 
+  // If possible, lower this packed shift into a vector multiply instead of
+  // expanding it into a sequence of scalar shifts.
+  // Do this only if the vector shift count is a constant build_vector.
+  if (Op.getOpcode() == ISD::SHL &&
+      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+       (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
+      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+    SmallVector<SDValue, 8> Elts;
+    EVT SVT = VT.getScalarType();
+    unsigned SVTBits = SVT.getSizeInBits();
+    const APInt &One = APInt(SVTBits, 1);
+    unsigned NumElems = VT.getVectorNumElements();
+
+    for (unsigned i = 0; i != NumElems; ++i) {
+      SDValue Op = Amt->getOperand(i);
+      if (Op->getOpcode() == ISD::UNDEF) {
+        Elts.push_back(Op);
+        continue;
+      }
+
+      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+      const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
+      uint64_t ShAmt = C.getZExtValue();
+      if (ShAmt >= SVTBits) {
+        Elts.push_back(DAG.getUNDEF(SVT));
+        continue;
+      }
+      Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
+    }
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems);
+    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+  }
+
   // Lower SHL with variable shift amount.
   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll
index d79dfcc076b..a70d45a7991 100644
--- a/test/CodeGen/X86/avx-shift.ll
+++ b/test/CodeGen/X86/avx-shift.ll
@@ -115,8 +115,8 @@ define <8 x i32> @vshift08(<8 x i32> %a) nounwind {
 ; PR15141
 ; CHECK: _vshift13:
 ; CHECK-NOT: vpsll
-; CHECK: vcvttps2dq
-; CHECK-NEXT: vpmulld
+; CHECK-NOT: vcvttps2dq
+; CHECK: vpmulld
 define <4 x i32> @vshift13(<4 x i32> %in) {
   %T = shl <4 x i32> %in,
   ret <4 x i32> %T
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
new file mode 100644
index 00000000000..df2d9cb0468
--- /dev/null
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512
+
+
+; Verify that we don't scalarize a packed vector shift left of 16-bit
+; signed integers if the amount is a constant build_vector.
+; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
+
+define <8 x i16> @test1(<8 x i16> %a) {
+  %shl = shl <8 x i16> %a,
+  ret <8 x i16> %shl
+}
+; CHECK-LABEL: test1
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+
+define <8 x i16> @test2(<8 x i16> %a) {
+  %shl = shl <8 x i16> %a,
+  ret <8 x i16> %shl
+}
+; CHECK-LABEL: test2
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+
+; Verify that a vector shift left of 32-bit signed integers is simply expanded
+; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
+; counts is a constant build_vector.
+
+define <4 x i32> @test3(<4 x i32> %a) {
+  %shl = shl <4 x i32> %a,
+  ret <4 x i32> %shl
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: cvttps2dq
+; SSE: pmulld
+; AVX2: vpsllvd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a) {
+  %shl = shl <4 x i32> %a,
+  ret <4 x i32> %shl
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: cvttps2dq
+; SSE: pmulld
+; AVX2: vpsllvd
+; CHECK-NEXT: ret
+
+
+; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
+; into two pmullw instructions. With AVX2, the test case below would produce
+; a single vpmullw.
+
+define <16 x i16> @test5(<16 x i16> %a) {
+  %shl = shl <16 x i16> %a,
+  ret <16 x i16> %shl
+}
+; CHECK-LABEL: test5
+; SSE: pmullw
+; SSE-NEXT: pmullw
+; AVX2: vpmullw
+; AVX2-NOT: vpmullw
+; CHECK: ret
+
+
+; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
+; into two pmulld instructions. With AVX2, the test case below would produce
+; a single vpsllvd instead.
+
+define <8 x i32> @test6(<8 x i32> %a) {
+  %shl = shl <8 x i32> %a,
+  ret <8 x i32> %shl
+}
+; CHECK-LABEL: test6
+; SSE: pmulld
+; SSE-NEXT: pmulld
+; AVX2: vpsllvd
+; CHECK: ret
+
+
+; With AVX2 and AVX512, the test case below should produce a sequence of
+; two vpmullw instructions. On SSE2 instead, we split the shift into four
+; parts and then convert each part into a pmullw.
+
+define <32 x i16> @test7(<32 x i16> %a) {
+  %shl = shl <32 x i16> %a,
+  ret <32 x i16> %shl
+}
+; CHECK-LABEL: test7
+; SSE: pmullw
+; SSE-NEXT: pmullw
+; SSE-NEXT: pmullw
+; SSE-NEXT: pmullw
+; AVX2: vpmullw
+; AVX2-NEXT: vpmullw
+; CHECK: ret
+
+
+; Similar to test7; the difference is that with AVX512 support we only
+; produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
+
+define <16 x i32> @test8(<16 x i32> %a) {
+  %shl = shl <16 x i32> %a,
+  ret <16 x i32> %shl
+}
+; CHECK-LABEL: test8
+; SSE: pmulld
+; SSE-NEXT: pmulld
+; SSE-NEXT: pmulld
+; SSE-NEXT: pmulld
+; AVX2ONLY: vpsllvd
+; AVX2ONLY-NEXT: vpsllvd
+; AVX512: vpsllvd
+; AVX512-NOT: vpsllvd
+; CHECK: ret
+
+
+; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512f support.
+
+define <8 x i64> @test9(<8 x i64> %a) {
+  %shl = shl <8 x i64> %a,
+  ret <8 x i64> %shl
+}
+; CHECK-LABEL: test9
+; AVX2ONLY: vpsllvq
+; AVX2ONLY-NEXT: vpsllvq
+; AVX512: vpsllvq
+; AVX512-NOT: vpsllvq
+; CHECK: ret
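Note (not part of the patch): the lowering above leans on a simple scalar identity. Shifting a lane left by a constant amount smaller than the element width is the same as multiplying that lane by the corresponding power of two, so the constant build_vector of shift amounts can be rewritten into a build_vector of multipliers that feeds a single pmullw/pmulld. The following minimal, standalone C++ sketch models that identity for a v8i16-style example; the lane values and shift amounts are made up for illustration and are not taken from the tests above.

// Standalone illustration only, not backend code: checks, lane by lane, that
// (x << amt) == x * (1 << amt) for 16-bit lanes when amt < 16, which is the
// scalar equivalence the new vector lowering relies on.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  // A v8i16-like value and a constant vector of shift amounts (all < 16).
  // These values are illustrative, not taken from the tests above.
  std::array<std::uint16_t, 8> Lanes   = {1, 2, 3, 4, 5, 6, 7, 8};
  std::array<unsigned, 8>      Amounts = {1, 1, 2, 2, 4, 4, 8, 8};

  for (std::size_t I = 0; I != Lanes.size(); ++I) {
    // What the scalarized expansion would compute for this lane.
    std::uint16_t Shifted = static_cast<std::uint16_t>(Lanes[I] << Amounts[I]);
    // What the multiply-based lowering computes instead: multiply by 2^amount,
    // i.e. the per-lane constant that ends up in the pmullw operand.
    std::uint16_t Multiplier = static_cast<std::uint16_t>(1u << Amounts[I]);
    std::uint16_t Multiplied = static_cast<std::uint16_t>(Lanes[I] * Multiplier);
    assert(Shifted == Multiplied);
  }
  return 0;
}

Lanes whose shift amount is greater than or equal to the element width fall outside this identity, which is why the patch pushes an undef multiplier (DAG.getUNDEF) for those lanes instead of a power of two.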