[x86] Teach the x86 DAG combiner to form MOVSLDUP and MOVSHDUP
instructions when it finds an appropriate pattern.

These are lovely instructions, and it's a shame to not use them. =] They
are fast, and can have loads folded into their operands, etc.

I've also plumbed the shuffle comment decoding through the various
layers so that the test cases are printed nicely.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217758 91177308-0d34-0410-b5e6-96231b3b80d8
Chandler Carruth 2014-09-15 11:15:23 +00:00
parent 69513f19ba
commit 08780d4c1d
5 changed files with 135 additions and 30 deletions


@@ -128,6 +128,42 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
    DecodeMOVHLPSMask(2, ShuffleMask);
    break;
  case X86::MOVSLDUPrr:
  case X86::VMOVSLDUPrr:
    Src1Name = getRegName(MI->getOperand(1).getReg());
    // FALL THROUGH.
  case X86::MOVSLDUPrm:
  case X86::VMOVSLDUPrm:
    DestName = getRegName(MI->getOperand(0).getReg());
    DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask);
    break;
  case X86::VMOVSHDUPYrr:
    Src1Name = getRegName(MI->getOperand(1).getReg());
    // FALL THROUGH.
  case X86::VMOVSHDUPYrm:
    DestName = getRegName(MI->getOperand(0).getReg());
    DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask);
    break;
  case X86::VMOVSLDUPYrr:
    Src1Name = getRegName(MI->getOperand(1).getReg());
    // FALL THROUGH.
  case X86::VMOVSLDUPYrm:
    DestName = getRegName(MI->getOperand(0).getReg());
    DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask);
    break;
  case X86::MOVSHDUPrr:
  case X86::VMOVSHDUPrr:
    Src1Name = getRegName(MI->getOperand(1).getReg());
    // FALL THROUGH.
  case X86::MOVSHDUPrm:
  case X86::VMOVSHDUPrm:
    DestName = getRegName(MI->getOperand(0).getReg());
    DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
    break;
  case X86::PALIGNR128rr:
  case X86::VPALIGNR128rr:
    Src1Name = getRegName(MI->getOperand(2).getReg());

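A note on how these comments surface: the register (rr) forms set Src1Name and fall through to the memory (rm) forms so both share one mask decode, and the decoded mask is what gets rendered into assembly comments like "xmm0 = xmm0[0,0,2,2]". The following is a minimal standalone sketch of that rendering (plain C++; formatMaskComment is a hypothetical helper, not LLVM's actual printer):

#include <cstdio>
#include <string>
#include <vector>

// Render a decoded shuffle mask in the "dest = src[0,0,2,2]" comment style.
static std::string formatMaskComment(const std::string &DestName,
                                     const std::string &SrcName,
                                     const std::vector<int> &Mask) {
  std::string S = DestName + " = " + SrcName + "[";
  for (size_t i = 0; i < Mask.size(); ++i) {
    if (i)
      S += ",";
    S += std::to_string(Mask[i]);
  }
  return S + "]";
}

int main() {
  // The mask DecodeMOVSLDUPMask produces for v4f32: even lanes, duplicated.
  std::printf("%s\n", formatMaskComment("xmm0", "xmm0", {0, 0, 2, 2}).c_str());
  // Prints: xmm0 = xmm0[0,0,2,2]
}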

@@ -63,6 +63,22 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
  ShuffleMask.push_back(NElts+i);
}

void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
  unsigned NumElts = VT.getVectorNumElements();
  for (int i = 0, e = NumElts / 2; i < e; ++i) {
    ShuffleMask.push_back(2 * i);
    ShuffleMask.push_back(2 * i);
  }
}

void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
  unsigned NumElts = VT.getVectorNumElements();
  for (int i = 0, e = NumElts / 2; i < e; ++i) {
    ShuffleMask.push_back(2 * i + 1);
    ShuffleMask.push_back(2 * i + 1);
  }
}

void DecodePALIGNRMask(MVT VT, unsigned Imm,
                       SmallVectorImpl<int> &ShuffleMask) {
  unsigned NumElts = VT.getVectorNumElements();

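Both loops have the same shape: each pair of output elements repeats a single source lane, even lanes (2*i) for MOVSLDUP and odd lanes (2*i+1) for MOVSHDUP. Here is a standalone sketch (plain C++ standing in for LLVM's MVT and SmallVectorImpl) that prints the masks for the v4f32 and v8f32 cases wired up in the comment printer above:

#include <cstdio>
#include <vector>

// High=false mirrors DecodeMOVSLDUPMask; High=true mirrors DecodeMOVSHDUPMask.
static std::vector<int> decodeMovDupMask(unsigned NumElts, bool High) {
  std::vector<int> Mask;
  for (unsigned i = 0, e = NumElts / 2; i < e; ++i) {
    Mask.push_back(2 * i + High); // Same lane pushed twice: 2*i or 2*i+1.
    Mask.push_back(2 * i + High);
  }
  return Mask;
}

int main() {
  for (unsigned NumElts : {4u, 8u})
    for (bool High : {false, true}) {
      std::printf("%s v%uf32:", High ? "movshdup" : "movsldup", NumElts);
      for (int M : decodeMovDupMask(NumElts, High))
        std::printf(" %d", M);
      std::printf("\n");
    }
  // movsldup v4f32: 0 0 2 2        movshdup v4f32: 1 1 3 3
  // movsldup v8f32: 0 0 2 2 4 4 6 6    movshdup v8f32: 1 1 3 3 5 5 7 7
}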

@@ -38,6 +38,10 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
// <0,2> or <0,1,4,5>
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);


@@ -5407,12 +5407,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    if (Mask.empty()) return false;
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    break;
  case X86ISD::MOVDDUP:
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  default: llvm_unreachable("unknown target shuffle node");
@@ -19364,38 +19368,53 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
  // vectors because it can have a load folded into it that UNPCK cannot. This
  // doesn't preclude something switching to the shorter encoding post-RA.
  if (FloatDomain) {
    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
      bool Lo = Mask.equals(0, 0);
      unsigned Shuffle;
      MVT ShuffleVT;
      // Check if we have SSE3 which will let us use MOVDDUP. That instruction
      // is no slower than UNPCKLPD but has the option to fold the input operand
      // into even an unaligned memory load.
      if (Lo && Subtarget->hasSSE3()) {
        Shuffle = X86ISD::MOVDDUP;
        ShuffleVT = MVT::v2f64;
      } else {
        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
        // than the UNPCK variants.
        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
        ShuffleVT = MVT::v4f32;
      }
      if (Depth == 1 && Root->getOpcode() == Shuffle)
        return false; // Nothing to do!
      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
      DCI.AddToWorklist(Op.getNode());
      if (Shuffle == X86ISD::MOVDDUP)
        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
      else
        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
      DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
    if (Subtarget->hasSSE3() &&
        (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
      bool Lo = Mask.equals(0, 0, 2, 2);
      unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
      MVT ShuffleVT = MVT::v4f32;
      if (Depth == 1 && Root->getOpcode() == Shuffle)
        return false; // Nothing to do!
      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
      DCI.AddToWorklist(Op.getNode());
      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
      DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                    /*AddTo*/ true);
      return true;
    }
  }
  // FIXME: We should match UNPCKLPS and UNPCKHPS here.
  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
  // variants as none of these have single-instruction variants that are
  // superior to the UNPCK formulation.

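To see the matching order of this combine in isolation: width-2 broadcast masks are tried first (MOVDDUP under SSE3, else the smaller MOVLHPS/MOVHLPS encodings), then the width-4 lane-duplication masks that MOVSLDUP/MOVSHDUP cover under SSE3. Below is a hedged standalone sketch of that decision (plain C++ returning instruction names instead of building DAG nodes; pickFloatShuffle is illustrative only):

#include <cstdio>
#include <string>
#include <vector>

// Mirrors the float-domain matching order of the combine above.
static std::string pickFloatShuffle(const std::vector<int> &Mask, bool HasSSE3) {
  if (Mask == std::vector<int>{0, 0})
    return HasSSE3 ? "MOVDDUP" : "MOVLHPS"; // MOVDDUP can fold unaligned loads.
  if (Mask == std::vector<int>{1, 1})
    return "MOVHLPS";
  if (HasSSE3 && Mask == std::vector<int>{0, 0, 2, 2})
    return "MOVSLDUP";
  if (HasSSE3 && Mask == std::vector<int>{1, 1, 3, 3})
    return "MOVSHDUP";
  return "no single-instruction match in this combine";
}

int main() {
  std::printf("%s\n", pickFloatShuffle({0, 0, 2, 2}, /*HasSSE3=*/true).c_str());
  std::printf("%s\n", pickFloatShuffle({1, 1}, /*HasSSE3=*/false).c_str());
  // Prints: MOVSLDUP, then MOVHLPS.
}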

@@ -119,6 +119,36 @@ define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %shuffle
}

define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: @shuffle_v4f32_0022
; SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,2,2]
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4f32_0022
; SSE41: movsldup {{.*}} # xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4f32_0022
; AVX1: vmovsldup {{.*}} # xmm0 = xmm0[0,0,2,2]
; AVX1-NEXT: retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %shuffle
}

define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: @shuffle_v4f32_1133
; SSE2: shufps {{.*}} # xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4f32_1133
; SSE41: movshdup {{.*}} # xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4f32_1133
; AVX1: vmovshdup {{.*}} # xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %shuffle
}
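
These FileCheck patterns mirror what the SSE3 intrinsics give you directly; as a small usage example, the standard _mm_moveldup_ps/_mm_movehdup_ps intrinsics from <pmmintrin.h> (compile with -msse3) demonstrate the lane duplication the masks describe:

#include <cstdio>
#include <pmmintrin.h>

int main() {
  __m128 V = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128 Lo = _mm_moveldup_ps(V); // movsldup: lanes [0,0,2,2] -> 1,1,3,3
  __m128 Hi = _mm_movehdup_ps(V); // movshdup: lanes [1,1,3,3] -> 2,2,4,4
  float L[4], H[4];
  _mm_storeu_ps(L, Lo);
  _mm_storeu_ps(H, Hi);
  std::printf("%g %g %g %g | %g %g %g %g\n",
              L[0], L[1], L[2], L[3], H[0], H[1], H[2], H[3]);
  // Prints: 1 1 3 3 | 2 2 4 4
}
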
define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: @shuffle_v4i32_0124