mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-11-02 07:11:49 +00:00
[X86, AVX] instcombine common cases of vperm2* intrinsics into shuffles
vperm2* intrinsics are just shuffles. In a few special cases, they're not even shuffles. Optimizing intrinsics in InstCombine is better than handling this in the front-end for at least two reasons: 1. Optimizing custom-written SSE intrinsic code at -O0 makes vector coders really angry (and so I have regrets about some patches from last week). 2. Doing mask conversion logic in header files is hard to write and subsequently read. There are a couple of TODOs in this patch to complete this optimization. Differential Revision: http://reviews.llvm.org/D8486 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232852 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
e0e1c1d94d
commit
be9ee96926
@ -197,6 +197,57 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
|
||||
/// source vectors, unless a zero bit is set. If a zero bit is set,
|
||||
/// then ignore that half of the mask and clear that half of the vector.
|
||||
static Value *SimplifyX86vperm2(const IntrinsicInst &II,
|
||||
InstCombiner::BuilderTy &Builder) {
|
||||
if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
|
||||
VectorType *VecTy = cast<VectorType>(II.getType());
|
||||
uint8_t Imm = CInt->getZExtValue();
|
||||
|
||||
// The immediate permute control byte looks like this:
|
||||
// [1:0] - select 128 bits from sources for low half of destination
|
||||
// [2] - ignore
|
||||
// [3] - zero low half of destination
|
||||
// [5:4] - select 128 bits from sources for high half of destination
|
||||
// [6] - ignore
|
||||
// [7] - zero high half of destination
|
||||
|
||||
if ((Imm & 0x88) == 0x88) {
|
||||
// If both zero mask bits are set, this was just a weird way to
|
||||
// generate a zero vector.
|
||||
return ConstantAggregateZero::get(VecTy);
|
||||
}
|
||||
|
||||
// TODO: If a single zero bit is set, replace one of the source operands
|
||||
// with a zero vector and use the same mask generation logic as below.
|
||||
|
||||
if ((Imm & 0x88) == 0x00) {
|
||||
// If neither zero mask bit is set, this is a simple shuffle.
|
||||
unsigned NumElts = VecTy->getNumElements();
|
||||
unsigned HalfSize = NumElts / 2;
|
||||
unsigned HalfBegin;
|
||||
SmallVector<int, 8> ShuffleMask(NumElts);
|
||||
|
||||
// Permute low half of result.
|
||||
HalfBegin = (Imm & 0x3) * HalfSize;
|
||||
for (unsigned i = 0; i != HalfSize; ++i)
|
||||
ShuffleMask[i] = HalfBegin + i;
|
||||
|
||||
// Permute high half of result.
|
||||
HalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
|
||||
for (unsigned i = HalfSize; i != NumElts; ++i)
|
||||
ShuffleMask[i] = HalfBegin + i - HalfSize;
|
||||
|
||||
Value *Op0 = II.getArgOperand(0);
|
||||
Value *Op1 = II.getArgOperand(1);
|
||||
return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// visitCallInst - CallInst simplification. This mostly only handles folding
|
||||
/// of intrinsic instructions. For normal calls, it allows visitCallSite to do
|
||||
/// the heavy lifting.
|
||||
@ -904,6 +955,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
||||
return ReplaceInstUsesWith(CI, Shuffle);
|
||||
}
|
||||
|
||||
case Intrinsic::x86_avx_vperm2f128_pd_256:
|
||||
case Intrinsic::x86_avx_vperm2f128_ps_256:
|
||||
case Intrinsic::x86_avx_vperm2f128_si_256:
|
||||
// TODO: Add the AVX2 version of this instruction.
|
||||
if (Value *V = SimplifyX86vperm2(*II, *Builder))
|
||||
return ReplaceInstUsesWith(*II, V);
|
||||
break;
|
||||
|
||||
case Intrinsic::ppc_altivec_vperm:
|
||||
// Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
|
||||
// Note that ppc_altivec_vperm has a big-endian bias, so when creating
|
||||
|
236
test/Transforms/InstCombine/x86-vperm2.ll
Normal file
236
test/Transforms/InstCombine/x86-vperm2.ll
Normal file
@ -0,0 +1,236 @@
|
||||
; RUN: opt < %s -instcombine -S | FileCheck %s
|
||||
|
||||
; This should never happen, but make sure we don't crash handling a non-constant immediate byte.
|
||||
|
||||
define <4 x double> @perm2pd_non_const_imm(<4 x double> %a0, <4 x double> %a1, i8 %b) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_non_const_imm
|
||||
; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
|
||||
; CHECK-NEXT: ret <4 x double>
|
||||
}
|
||||
|
||||
|
||||
; In the following 3 tests, both zero mask bits of the immediate are set.
|
||||
|
||||
define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x88
|
||||
; CHECK-NEXT: ret <4 x double> zeroinitializer
|
||||
}
|
||||
|
||||
define <8 x float> @perm2ps_0x88(<8 x float> %a0, <8 x float> %a1) {
|
||||
%res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 136)
|
||||
ret <8 x float> %res
|
||||
|
||||
; CHECK-LABEL: @perm2ps_0x88
|
||||
; CHECK-NEXT: ret <8 x float> zeroinitializer
|
||||
}
|
||||
|
||||
define <8 x i32> @perm2si_0x88(<8 x i32> %a0, <8 x i32> %a1) {
|
||||
%res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 136)
|
||||
ret <8 x i32> %res
|
||||
|
||||
; CHECK-LABEL: @perm2si_0x88
|
||||
; CHECK-NEXT: ret <8 x i32> zeroinitializer
|
||||
}
|
||||
|
||||
|
||||
; The other control bits are ignored when zero mask bits of the immediate are set.
|
||||
|
||||
define <4 x double> @perm2pd_0xff(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 255)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0xff
|
||||
; CHECK-NEXT: ret <4 x double> zeroinitializer
|
||||
}
|
||||
|
||||
|
||||
; The following 16 tests are simple shuffles, except for 2 cases where we can just return one of the
|
||||
; source vectors. Verify that we generate the right shuffle masks and undef source operand where possible.
|
||||
|
||||
define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x00
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 1)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x01
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x02
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x03
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x10
|
||||
; CHECK-NEXT: ret <4 x double> %a0
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 17)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x11
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x12
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x13
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x20
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 33)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x21
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 34)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x22
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 35)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x23
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 48)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x30
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 49)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x31
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 50)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x32
|
||||
; CHECK-NEXT: ret <4 x double> %a1
|
||||
}
|
||||
|
||||
define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 51)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x33
|
||||
; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <4 x double> %1
|
||||
}
|
||||
|
||||
; Confirm that a mask for 32-bit elements is also correct.
|
||||
|
||||
define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
|
||||
%res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 49)
|
||||
ret <8 x float> %res
|
||||
|
||||
; CHECK-LABEL: @perm2ps_0x31
|
||||
; CHECK-NEXT: %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
|
||||
; CHECK-NEXT: ret <8 x float> %1
|
||||
}
|
||||
|
||||
|
||||
; Confirm that when a single zero mask bit is set, we do nothing.
|
||||
|
||||
define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x83
|
||||
; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 -125)
|
||||
; CHECK-NEXT: ret <4 x double>
|
||||
}
|
||||
|
||||
|
||||
; Confirm that when the other zero mask bit is set, we do nothing. Also confirm that an ignored bit has no effect.
|
||||
|
||||
define <4 x double> @perm2pd_0x48(<4 x double> %a0, <4 x double> %a1) {
|
||||
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72)
|
||||
ret <4 x double> %res
|
||||
|
||||
; CHECK-LABEL: @perm2pd_0x48
|
||||
; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72)
|
||||
; CHECK-NEXT: ret <4 x double>
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
|
||||
declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
|
||||
|
Loading…
Reference in New Issue
Block a user