[DAGCombiner] Convert constant AND masks to shuffle clear masks down to the byte level

The XformToShuffleWithZero method currently checks AND masks at the per-lane level for all-one and all-zero constants and attempts to convert them to legal shuffle clear masks.

This patch generalises XformToShuffleWithZero, splitting and checking the sub-lanes of the constants down to the byte level to see if any legal shuffle clear masks are possible. This allows a lot of masks (often from legalization or truncation) to be folded into existing shuffle patterns and removes a lot of constant mask loading.

There are a few examples of poor shuffle lowering that are exposed by this patch that will be cleaned up in future patches (e.g. merging shuffles that are separated by bitcasts, x86 legalized v8i8 zero extension uses PMOVZX+AND+AND instead of AND+PMOVZX, etc.)

Differential Revision: http://reviews.llvm.org/D11518

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@243831 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Simon Pilgrim 2015-08-01 10:01:46 +00:00
parent b80eede641
commit fa2c240c15
10 changed files with 226 additions and 229 deletions

View File

@ -13002,34 +13002,76 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
if (RHS.getOpcode() == ISD::BITCAST)
RHS = RHS.getOperand(0);
if (RHS.getOpcode() == ISD::BUILD_VECTOR) {
SmallVector<int, 8> Indices;
unsigned NumElts = RHS.getNumOperands();
if (RHS.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
for (unsigned i = 0; i != NumElts; ++i) {
SDValue Elt = RHS.getOperand(i);
if (isAllOnesConstant(Elt))
EVT RVT = RHS.getValueType();
unsigned NumElts = RHS.getNumOperands();
// Attempt to create a valid clear mask, splitting the mask into
// sub elements and checking to see if each is
// all zeros or all ones - suitable for shuffle masking.
auto BuildClearMask = [&](int Split) {
int NumSubElts = NumElts * Split;
int NumSubBits = RVT.getScalarSizeInBits() / Split;
SmallVector<int, 8> Indices;
for (int i = 0; i != NumSubElts; ++i) {
int EltIdx = i / Split;
int SubIdx = i % Split;
SDValue Elt = RHS.getOperand(EltIdx);
if (Elt.getOpcode() == ISD::UNDEF) {
Indices.push_back(-1);
continue;
}
APInt Bits;
if (isa<ConstantSDNode>(Elt))
Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
else if (isa<ConstantFPSDNode>(Elt))
Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
else
return SDValue();
// Extract the sub element from the constant bit mask.
if (DAG.getDataLayout().isBigEndian()) {
Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits);
} else {
Bits = Bits.lshr(SubIdx * NumSubBits);
}
if (Split > 1)
Bits = Bits.trunc(NumSubBits);
if (Bits.isAllOnesValue())
Indices.push_back(i);
else if (isNullConstant(Elt))
Indices.push_back(NumElts+i);
else if (Bits == 0)
Indices.push_back(i + NumSubElts);
else
return SDValue();
}
// Let's see if the target supports this vector_shuffle.
EVT RVT = RHS.getValueType();
if (!TLI.isVectorClearMaskLegal(Indices, RVT))
EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
return SDValue();
// Return the new VECTOR_SHUFFLE node.
EVT EltVT = RVT.getVectorElementType();
SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(),
DAG.getConstant(0, dl, EltVT));
SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, RVT, ZeroOps);
LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS);
SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuf);
}
SDValue Zero = DAG.getConstant(0, dl, ClearVT);
return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, dl,
DAG.getBitcast(ClearVT, LHS),
Zero, &Indices[0]));
};
// Determine maximum split level (byte level masking).
int MaxSplit = 1;
if (RVT.getScalarSizeInBits() % 8 == 0)
MaxSplit = RVT.getScalarSizeInBits() / 8;
for (int Split = 1; Split <= MaxSplit; ++Split)
if (RVT.getScalarSizeInBits() % Split == 0)
if (SDValue S = BuildClearMask(Split))
return S;
return SDValue();
}
@ -13042,9 +13084,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if (SDValue Shuffle = XformToShuffleWithZero(N))
return Shuffle;
// If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold
// this operation.
if (LHS.getOpcode() == ISD::BUILD_VECTOR &&
@ -13095,6 +13134,10 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops);
}
// Try to convert a constant mask AND into a shuffle clear mask.
if (SDValue Shuffle = XformToShuffleWithZero(N))
return Shuffle;
// Type legalization might introduce new shuffles in the DAG.
// Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
// -> (shuffle (VBinOp (A, B)), Undef, Mask).

View File

@ -62,8 +62,7 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
; CHECK-LABEL: zext_8i8_8i32:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
%B = zext <8 x i8> %A to <8 x i32>
ret <8 x i32>%B

View File

@ -192,8 +192,8 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; SKX-LABEL: test15:
; SKX: ## BB#0:
; SKX-NEXT: vpandq {{.*}}(%rip), %xmm0, %xmm0
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq

View File

@ -313,6 +313,7 @@ define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: @shl_zext_srl_v4i32
; CHECK: andps
; CHECK: andps
; CHECK-NEXT: ret
define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
%srl = lshr <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>

View File

@ -269,8 +269,10 @@ entry:
define <4 x i32> @t17() nounwind {
; X64-LABEL: t17:
; X64: ## BB#0: ## %entry
; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: andpd {{.*}}(%rip), %xmm0
; X64-NEXT: movaps (%rax), %xmm0
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
entry:
%tmp1 = load <4 x float>, <4 x float>* undef, align 16

View File

@ -46,10 +46,12 @@ define <4 x float> @foo1_4(<4 x i8> %src) {
define <8 x float> @foo2_8(<8 x i8> %src) {
; CHECK-LABEL: foo2_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: vandps LCPI2_0, %ymm0, %ymm0
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
;

View File

@ -455,18 +455,14 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_2i16_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i16_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
@ -503,19 +499,15 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_2i8_to_2f64:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i8_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
@ -632,31 +624,31 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm4, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm5, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: subpd %xmm4, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
; SSE-NEXT: addpd %xmm1, %xmm5
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE-NEXT: pand {{.*}}(%rip), %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
; SSE-NEXT: subpd %xmm5, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; SSE-NEXT: addpd %xmm6, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: subpd %xmm5, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm6
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: subpd %xmm4, %xmm2
; SSE-NEXT: subpd %xmm5, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE-NEXT: subpd %xmm4, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
; SSE-NEXT: addpd %xmm5, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: subpd %xmm5, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
; SSE-NEXT: addpd %xmm4, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
@ -1725,8 +1717,7 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
@ -1767,18 +1758,19 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
; AVX1-LABEL: uitofp_8i8_to_8f32:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

View File

@ -39,8 +39,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_16i16:
@ -48,16 +47,15 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_16i16:
@ -125,34 +123,31 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@ -163,28 +158,25 @@ entry:
define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_2i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_2i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_2i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_16i8_to_2i64:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
@ -195,55 +187,48 @@ entry:
define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255]
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,1,1,2,2,3,3,3,3,5,5,2,2,3,3]
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255]
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,1,1,2,2,3,3,3,3,5,5,2,2,3,3]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_4i64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@ -285,8 +270,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_8i32:
@ -294,16 +278,15 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i16_to_8i32:
@ -326,28 +309,26 @@ entry:
define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_2i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_2i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_2i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_8i16_to_2i64:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
@ -358,52 +339,49 @@ entry:
define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,6,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4,5],zero,zero,zero,zero,zero,zero,xmm1[6,7],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535]
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,6,7,10,11,4,5,6,7]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i16_to_4i64:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i16_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@ -414,26 +392,24 @@ entry:
define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_4i32_to_2i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_4i32_to_2i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_4i32_to_2i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_4i32_to_2i64:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
@ -444,32 +420,26 @@ entry:
define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_4i32_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_4i32_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_4i32_to_4i64:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_4i32_to_4i64:
@ -561,7 +531,7 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
@ -572,11 +542,9 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,zero,zero,zero,zero,xmm1[12],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i8_to_4i64:
@ -639,7 +607,7 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand %xmm2, %xmm1
@ -650,11 +618,8 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[6],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[14],zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i32:
@ -687,8 +652,7 @@ define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_16i8_to_16i16:
@ -697,8 +661,7 @@ define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_16i8_to_16i16:
@ -792,7 +755,7 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
@ -802,11 +765,9 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9],zero,zero,zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i16_to_4i64:
@ -839,8 +800,7 @@ define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i16_to_8i32:
@ -849,8 +809,7 @@ define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i16_to_8i32:
@ -910,21 +869,19 @@ define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSE2-LABEL: load_zext_4i32_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i32_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i32_to_4i64:
@ -955,7 +912,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand %xmm0, %xmm1
@ -966,7 +923,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pand %xmm0, %xmm1
@ -976,7 +933,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSE41-LABEL: zext_8i8_to_8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pand %xmm0, %xmm1
@ -985,17 +942,18 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
;
; AVX1-LABEL: zext_8i8_to_8i32:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i8_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
%t = zext <8 x i8> %z to <8 x i32>

View File

@ -62,13 +62,15 @@ bb:
; CHECK-LABEL: test3:
; Compute the mask.
; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[MASK:%xmm[0-9]+]]
; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[MASK:%xmm[0-9]+]]
; Do not shrink the bit of the mask.
; CHECK-NOT: vpslld $31, [[MASK]], {{%xmm[0-9]+}}
; CHECK-NOT: vpslld $31, [[MASK]], {{%xmm[0-9]+}}
; Use the mask in the blend.
; CHECK-NEXT: vblendvps [[MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
; Use the mask in the and.
; CHECK-NEXT: vpand LCPI2_2(%rip), [[MASK]], {{%xmm[0-9]+}}
; CHECK-NEXT: vblendvps [[MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
; Shuffle mask to truncate.
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK: vpshufb %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
; CHECK: vpshufb %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
; CHECK: retq
define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
%tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>

View File

@ -191,9 +191,7 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
; CHECK-NEXT: movb $1, 2(%[[PTR1]])
; CHECK-NEXT: movl (%[[PTR0]]), [[TMP1:%e[abcd]+x]]
; CHECK-NEXT: movl [[TMP1]], [[TMP2:.*]]
; CHECK-NEXT: pmovzxbd [[TMP2]], %[[X0:xmm[0-9]+]]
; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
; CHECK-NEXT: movdqa %[[X0]], %[[X1:xmm[0-9]+]]
; CHECK-NEXT: psrld $1, %[[X1]]
; CHECK-NEXT: pblendw $192, %[[X0]], %[[X1]]