[x86] Teach the new vector shuffle lowering about the simplest of

'insertps' patterns.

This replaces two shuffles with a single insertps in very common cases.
My next patch will extend this to leverage the zeroing capabilities of
insertps, which will allow it to be used in a much wider set of cases.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217100 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Chandler Carruth 2014-09-03 22:48:34 +00:00
parent 5f209637c4
commit 699fd1909e
2 changed files with 41 additions and 4 deletions

View File

@ -7182,6 +7182,21 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
return true;
}
/// \brief Check whether all of one set of inputs to a shuffle mask are in
/// place.
///
/// A mask entry is "in place" when, after subtracting the offset of the input
/// it refers to, it equals its own position in the mask. Mask entries pointing
/// at the other input or undef (-1) will be skipped.
///
/// \param Mask    The shuffle mask to inspect. Entries below the mask width
///                refer to the low input; entries at or above it refer to the
///                high input.
/// \param LoInput When true the low input is checked, otherwise the high one.
/// \returns true if every entry that refers to the selected input is in place.
static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) {
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    // The split between the two inputs is the mask width, not a fixed 4, so
    // that it agrees with the 'Size' offset subtracted below for any width.
    if (M == -1 || (LoInput && M >= Size) || (!LoInput && M < Size))
      continue;
    if (M - (LoInput ? 0 : Size) != i)
      return false;
  }
  return true;
}
// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
// 2013 will allow us to use it as a non-type template parameter.
namespace {
@ -7365,6 +7380,20 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
int V2Index =
std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
Mask.begin();
// Check whether we can use INSERTPS to perform the blend. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions, which
// are much smaller to encode than a SHUFPS and an INSERTPS.
if (Subtarget->hasSSE41() &&
isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) {
// Insert the V2 element into the desired position.
SDValue InsertPSMask =
DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4);
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
InsertPSMask);
}
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
int V2AdjIndex = V2Index ^ 1;

View File

@ -121,10 +121,18 @@ define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
}
; Blend test: the shuffle mask <0, 1, 2, 4> keeps lanes 0-2 of %a and places
; lane 0 of %b in the last lane. The per-target checks below expect two shufps
; instructions on SSE2 and a single insertps / vinsertps on SSE4.1 / AVX1.
; NOTE(review): the checks using the ALL prefix expect the two-shufps sequence
; on every target, which conflicts with the SSE4.1 and AVX1 expectations; they
; look like the pre-change side of this diff -- confirm before relying on them.
define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
; ALL-LABEL: @shuffle_v4i32_0124
; ALL: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
; ALL-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0]
; ALL-NEXT: retq
; SSE2-LABEL: @shuffle_v4i32_0124
; SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4i32_0124
; SSE41: insertps {{.*}} # xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4i32_0124
; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0,1,2],xmm1[0]
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %shuffle
}