Convert some X86 blendv* intrinsics into IR.

Summary:
Implemented an InstCombine transformation that takes a blendv* intrinsic
call and translates it into an IR select, if the mask is constant.

This will eventually get lowered into blends with immediates if possible,
or pblendvb (with an option to further optimize if we can transform the
pblendvb into a blend+immediate instruction, depending on the selector).
It will also enable optimizations by the IR passes, which give up on
sight of the intrinsic.

Both the transformation and the lowering of its result to asm got shiny
new tests.

The transformation is a bit convoluted because of blendvp[sd]'s
definition:

Its mask is a floating point value! This forces us to convert it and get
the highest bit. I suppose this happened because the mask has type
__m128 in Intel's intrinsic and v4sf (for blendps) in gcc's builtin.

I will send an email to llvm-dev to discuss if we want to change this or
not.

Reviewers: grosbach, delena, nadav

Differential Revision: http://reviews.llvm.org/D3859

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209643 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Filipe Cabecinhas 2014-05-27 03:42:20 +00:00
parent b84ced649e
commit c5f611404c
5 changed files with 157 additions and 0 deletions

View File

@ -718,6 +718,41 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
case Intrinsic::x86_sse41_pblendvb:
case Intrinsic::x86_sse41_blendvps:
case Intrinsic::x86_sse41_blendvpd:
case Intrinsic::x86_avx_blendv_ps_256:
case Intrinsic::x86_avx_blendv_pd_256:
case Intrinsic::x86_avx2_pblendvb: {
// Convert blendv* to vector selects if the mask is constant.
// This optimization is convoluted because the intrinsic is defined as
// getting a vector of floats or doubles for the ps and pd versions.
// FIXME: That should be changed.
Value *Mask = II->getArgOperand(2);
if (auto C = dyn_cast<ConstantDataVector>(Mask)) {
auto Tyi1 = Builder->getInt1Ty();
auto SelectorType = cast<VectorType>(Mask->getType());
auto EltTy = SelectorType->getElementType();
unsigned Size = SelectorType->getNumElements();
unsigned BitWidth = EltTy->isFloatTy() ? 32 : (EltTy->isDoubleTy() ? 64 : EltTy->getIntegerBitWidth());
assert(BitWidth == 64 || BitWidth == 32 || BitWidth == 8 && "Wrong arguments for variable blend intrinsic");
SmallVector<Constant*, 32> Selectors;
for (unsigned I = 0; I < Size; ++I) {
// The intrinsics only read the top bit
uint64_t Selector;
if (BitWidth == 8)
Selector = C->getElementAsInteger(I);
else
Selector = C->getElementAsAPFloat(I).bitcastToAPInt().getZExtValue();
Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1)));
}
auto NewSelector = ConstantVector::get(Selectors);
return SelectInst::Create(NewSelector, II->getArgOperand(0), II->getArgOperand(1), "blendv");
} else {
break;
}
}
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:
case Intrinsic::x86_avx_vpermilvar_pd:

View File

@ -135,3 +135,26 @@ define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
%min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
ret <2 x double> %min
}
; If we can figure out a blend has a constant mask, we should emit the
; blend instruction with an immediate mask
define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
; CHECK-LABEL: constant_blendvpd_avx:
; CHECK-NOT: mov
; CHECK: vblendpd
; CHECK: ret
%1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
ret <4 x double> %1
}
define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
; CHECK-LABEL: constant_blendvps_avx:
; CHECK-NOT: mov
; CHECK: vblendps
; CHECK: ret
%1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
ret <8 x float> %1
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)

View File

@ -0,0 +1,11 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; CHECK-LABEL: constant_pblendvb_avx2:
; CHECK: vmovdqa
; CHECK: vpblendvb
%1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
ret <32 x i8> %1
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)

View File

@ -88,3 +88,35 @@ entry:
store double %extract214vector_func.i, double addrspace(1)* undef, align 8
ret void
}
; If we can figure out a blend has a constant mask, we should emit the
; blend instruction with an immediate mask
define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
; In this case, we emit a simple movss
; CHECK-LABEL: constant_blendvpd
; CHECK: movsd
; CHECK: ret
%1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab
ret <2 x double> %1
}
define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
; CHECK-LABEL: constant_blendvps
; CHECK-NOT: mov
; CHECK: blendps $7
; CHECK: ret
%1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd
ret <4 x float> %1
}
define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
; CHECK-LABEL: constant_pblendvb:
; CHECK: movaps
; CHECK: pblendvb
; CHECK: ret
%1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd
ret <16 x i8> %1
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)

View File

@ -0,0 +1,56 @@
; RUN: opt < %s -instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -S | FileCheck %s
define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
; CHECK-LABEL: @constant_blendvpd
; CHECK: select <2 x i1> <i1 true, i1 false>
%1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %ab, <2 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00>)
ret <2 x double> %1
}
define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
; CHECK-LABEL: @constant_blendvps
; CHECK: select <4 x i1> <i1 false, i1 false, i1 false, i1 true>
%1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %abcd, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>)
ret <4 x float> %1
}
define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
; CHECK-LABEL: @constant_pblendvb
; CHECK: select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>
%1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd, <16 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>)
ret <16 x i8> %1
}
define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
; CHECK-LABEL: @constant_blendvpd_avx
; CHECK: select <4 x i1> <i1 true, i1 false, i1 true, i1 false>
%1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %ab, <4 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00, double 0xFFFFFFFFE0000000, double 0.000000e+00>)
ret <4 x double> %1
}
define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
; CHECK-LABEL: @constant_blendvps_avx
; CHECK: select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>
%1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %abcd, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>)
ret <8 x float> %1
}
define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; CHECK-LABEL: @constant_pblendvb_avx2
; CHECK: select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>
%1 = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %xyzw, <32 x i8> %abcd,
<32 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>)
ret <32 x i8> %1
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)