From be9ee969260fdc03eebcd9f647dcaa5f1384b0cf Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 20 Mar 2015 21:47:56 +0000 Subject: [PATCH] [X86, AVX] instcombine common cases of vperm2* intrinsics into shuffles vperm2* intrinsics are just shuffles. In a few special cases, they're not even shuffles. Optimizing intrinsics in InstCombine is better than handling this in the front-end for at least two reasons: 1. Optimizing custom-written SSE intrinsic code at -O0 makes vector coders really angry (and so I have regrets about some patches from last week). 2. Doing mask conversion logic in header files is hard to write and subsequently read. There are a couple of TODOs in this patch to complete this optimization. Differential Revision: http://reviews.llvm.org/D8486 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232852 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/InstCombineCalls.cpp | 59 +++++ test/Transforms/InstCombine/x86-vperm2.ll | 236 ++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 test/Transforms/InstCombine/x86-vperm2.ll diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 00d92c873bd..b59c9f5d910 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -197,6 +197,57 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return nullptr; } +/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit +/// source vectors, unless a zero bit is set. If a zero bit is set, +/// then ignore that half of the mask and clear that half of the vector. 
+static Value *SimplifyX86vperm2(const IntrinsicInst &II,
+                                InstCombiner::BuilderTy &Builder) {
+  if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+    VectorType *VecTy = cast<VectorType>(II.getType());
+    uint8_t Imm = CInt->getZExtValue();
+
+    // The immediate permute control byte looks like this:
+    //    [1:0] - select 128 bits from sources for low half of destination
+    //    [2]   - ignore
+    //    [3]   - zero low half of destination
+    //    [5:4] - select 128 bits from sources for high half of destination
+    //    [6]   - ignore
+    //    [7]   - zero high half of destination
+
+    if ((Imm & 0x88) == 0x88) {
+      // If both zero mask bits are set, this was just a weird way to
+      // generate a zero vector.
+      return ConstantAggregateZero::get(VecTy);
+    }
+
+    // TODO: If a single zero bit is set, replace one of the source operands
+    // with a zero vector and use the same mask generation logic as below.
+
+    if ((Imm & 0x88) == 0x00) {
+      // If neither zero mask bit is set, this is a simple shuffle.
+      unsigned NumElts = VecTy->getNumElements();
+      unsigned HalfSize = NumElts / 2;
+      unsigned HalfBegin;
+      SmallVector<int, 8> ShuffleMask(NumElts);
+
+      // Permute low half of result.
+      HalfBegin = (Imm & 0x3) * HalfSize;
+      for (unsigned i = 0; i != HalfSize; ++i)
+        ShuffleMask[i] = HalfBegin + i;
+
+      // Permute high half of result.
+      HalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
+      for (unsigned i = HalfSize; i != NumElts; ++i)
+        ShuffleMask[i] = HalfBegin + i - HalfSize;
+
+      Value *Op0 = II.getArgOperand(0);
+      Value *Op1 = II.getArgOperand(1);
+      return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask);
+    }
+  }
+  return nullptr;
+}
+
 /// visitCallInst - CallInst simplification.  This mostly only handles folding
 /// of intrinsic instructions.  For normal calls, it allows visitCallSite to do
 /// the heavy lifting.
@@ -904,6 +955,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(CI, Shuffle); } + case Intrinsic::x86_avx_vperm2f128_pd_256: + case Intrinsic::x86_avx_vperm2f128_ps_256: + case Intrinsic::x86_avx_vperm2f128_si_256: + // TODO: Add the AVX2 version of this instruction. + if (Value *V = SimplifyX86vperm2(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating diff --git a/test/Transforms/InstCombine/x86-vperm2.ll b/test/Transforms/InstCombine/x86-vperm2.ll new file mode 100644 index 00000000000..92cc4afefa7 --- /dev/null +++ b/test/Transforms/InstCombine/x86-vperm2.ll @@ -0,0 +1,236 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +; This should never happen, but make sure we don't crash handling a non-constant immediate byte. + +define <4 x double> @perm2pd_non_const_imm(<4 x double> %a0, <4 x double> %a1, i8 %b) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b) + ret <4 x double> %res + +; CHECK-LABEL: @perm2pd_non_const_imm +; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b) +; CHECK-NEXT: ret <4 x double> +} + + +; In the following 3 tests, both zero mask bits of the immediate are set. 
+ +define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136) + ret <4 x double> %res + +; CHECK-LABEL: @perm2pd_0x88 +; CHECK-NEXT: ret <4 x double> zeroinitializer +} + +define <8 x float> @perm2ps_0x88(<8 x float> %a0, <8 x float> %a1) { + %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 136) + ret <8 x float> %res + +; CHECK-LABEL: @perm2ps_0x88 +; CHECK-NEXT: ret <8 x float> zeroinitializer +} + +define <8 x i32> @perm2si_0x88(<8 x i32> %a0, <8 x i32> %a1) { + %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 136) + ret <8 x i32> %res + +; CHECK-LABEL: @perm2si_0x88 +; CHECK-NEXT: ret <8 x i32> zeroinitializer +} + + +; The other control bits are ignored when zero mask bits of the immediate are set. + +define <4 x double> @perm2pd_0xff(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 255) + ret <4 x double> %res + +; CHECK-LABEL: @perm2pd_0xff +; CHECK-NEXT: ret <4 x double> zeroinitializer +} + + +; The following 16 tests are simple shuffles, except for 2 cases where we can just return one of the +; source vectors. Verify that we generate the right shuffle masks and undef source operand where possible.. 
+
+define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x00
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 1)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x01
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x02
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x03
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x10
+; CHECK-NEXT:  ret <4 x double> %a0
+}
+
+define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 17)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x11
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x12
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x13
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x20
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 33)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x21
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 34)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x22
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 35)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x23
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 48)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x30
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 49)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x31
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 50)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x32
+; CHECK-NEXT:  ret <4 x double> %a1
+}
+
+define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 51)
+  ret <4 x double> %res
+
+; CHECK-LABEL: @perm2pd_0x33
+; CHECK-NEXT:  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:  ret <4 x double> %1
+}
+
+; Confirm that a mask for 32-bit elements is also correct.
+
+define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
+  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 49)
+  ret <8 x float> %res
+
+; CHECK-LABEL: @perm2ps_0x31
+; CHECK-NEXT:  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  ret <8 x float> %1
+}
+
+
+; Confirm that when a single zero mask bit is set, we do nothing.
+ +define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131) + ret <4 x double> %res + +; CHECK-LABEL: @perm2pd_0x83 +; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 -125) +; CHECK-NEXT: ret <4 x double> +} + + +; Confirm that when the other zero mask bit is set, we do nothing. Also confirm that an ignored bit has no effect. + +define <4 x double> @perm2pd_0x48(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72) + ret <4 x double> %res + +; CHECK-LABEL: @perm2pd_0x48 +; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72) +; CHECK-NEXT: ret <4 x double> +} + +declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone +declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone +declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone +