From 30ce74b5e3b3cff4bffe93ece4a2b12b184a025e Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Tue, 23 Sep 2014 22:39:02 +0000 Subject: [PATCH] [x86] Teach the new vector shuffle lowering to lower v4i64 vector shuffles using the AVX2 instructions. This is the first step of cutting in real AVX2 support. Note that I have spotted at least one bug in the test cases already, but I suspect it was already present and just is getting surfaced. Will investigate next. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218338 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 63 +++- test/CodeGen/X86/vector-shuffle-256-v4.ll | 374 ++++++++++++++-------- test/CodeGen/X86/vector-shuffle-256-v8.ll | 13 +- 3 files changed, 311 insertions(+), 139 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0f0341e8ace..f01f86f0c0a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7270,12 +7270,17 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, MVT::i8)); + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into // that instruction. if (Subtarget->hasAVX2()) { - int Scale = 4 / VT.getVectorNumElements(); + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= Size) @@ -9372,11 +9377,57 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); - // FIXME: Actually implement this using AVX2!!! - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V2); - return DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, - DAG.getVectorShuffle(MVT::v4f64, DL, V1, V2, Mask)); + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // When the shuffle is mirrored between the 128-bit lanes of the unit, we can + // use lower latency instructions that will operate on both 128-bit lanes. + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask)) { + if (isSingleInputShuffleMask(Mask)) { + int PSHUFDMask[] = {-1, -1, -1, -1}; + for (int i = 0; i < 2; ++i) + if (Mask[i] >= 0) { + PSHUFDMask[2 * i] = 2 * Mask[i]; + PSHUFDMask[2 * i + 1] = 2 * Mask[i] + 1; + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v4i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + } + + // AVX2 provides a direct instruction for permuting a single input across + // lanes. + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Shuffle the input elements into the desired positions in V1 and V2 and + // blend them together. + int V1Mask[] = {-1, -1, -1, -1}; + int V2Mask[] = {-1, -1, -1, -1}; + int BlendMask[] = {-1, -1, -1, -1}; + for (int i = 0; i < 4; ++i) + if (Mask[i] >= 0 && Mask[i] < 4) { + V1Mask[i] = Mask[i]; + BlendMask[i] = i; + } else if (Mask[i] >= 4) { + V2Mask[i] = Mask[i] - 4; + BlendMask[i] = i + 4; + } + + V1 = DAG.getVectorShuffle(MVT::v4i64, DL, V1, DAG.getUNDEF(MVT::v4i64), V1Mask); + V2 = DAG.getVectorShuffle(MVT::v4i64, DL, V2, DAG.getUNDEF(MVT::v4i64), V2Mask); + return DAG.getVectorShuffle(MVT::v4i64, DL, V1, V2, BlendMask); } /// \brief Handle lowering of 8-lane 32-bit floating point shuffles. diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 175e732b9ab..ddf041f51b7 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -250,203 +250,305 @@ define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) { } define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0001 -; ALL: # BB#0: -; ALL-NEXT: vunpcklpd {{.*}} # xmm1 = xmm0[0,0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0001 +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0001 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0020 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0] -; ALL-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0020 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0020 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[0,0,2,0] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0112 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vshufpd {{.*}} # xmm1 = xmm0[1],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0112 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[1],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0112 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[0,1,1,2] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0300 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1] -; ALL-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0300 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0300 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[0,3,0,0] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_1000 -; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*}} # xmm1 = xmm0[1,0] -; ALL-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_1000 +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*}} # xmm1 = xmm0[1,0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_1000 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[1,0,0,0] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_2200 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] -; ALL-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_2200 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_2200 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[2,2,0,0] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_3330 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0] -; ALL-NEXT: vmovhlps {{.*}} # xmm1 = xmm1[1,1] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_3330 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0] +; AVX1-NEXT: vmovhlps {{.*}} # xmm1 = xmm1[1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_3330 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[3,3,3,0] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_3210 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; ALL-NEXT: vpermilpd {{.*}} # xmm1 = xmm1[1,0] -; ALL-NEXT: vpermilpd {{.*}} # xmm0 = xmm0[1,0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_3210 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpermilpd {{.*}} # xmm1 = xmm1[1,0] +; AVX1-NEXT: vpermilpd {{.*}} # xmm0 = xmm0[1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_3210 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[3,2,1,0] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0124 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] -; ALL-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0124 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0124 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm1 = ymm1[0,1,2,0] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0142 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vunpcklpd {{.*}} # xmm2 = xmm2[0,0] -; ALL-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0142 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vunpcklpd {{.*}} # xmm2 = xmm2[0,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0142 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm1 = ymm1[0,1,0,3] +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[0,1,2,2] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0412 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0] -; ALL-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] -; ALL-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1] -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0412 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0412 +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*}} # ymm1 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[0,1,1,2] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_4012 -; ALL: # BB#0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0] -; ALL-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] -; ALL-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1] -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_4012 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_4012 +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0145 -; ALL: # BB#0: -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0145 +; AVX1: # BB#0: +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0145 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm1 = ymm1[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_0451 -; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*}} # xmm2 = xmm1[1,0] -; ALL-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1] -; ALL-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] -; ALL-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1] -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_0451 +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*}} # xmm2 = xmm1[1,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_0451 +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*}} # ymm1 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpshufd {{.*}} # ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_4501 -; ALL: # BB#0: -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_4501 +; AVX1: # BB#0: +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_4501 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @shuffle_v4i64_4015 -; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*}} # xmm2 = xmm0[1,0] -; ALL-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1] -; ALL-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] -; ALL-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1] -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v4i64_4015 +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*}} # xmm2 = xmm0[1,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1] +; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v4i64_4015 +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*}} # ymm1 = ymm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { -; ALL-LABEL: @stress_test1 -; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*}} # xmm0 = xmm1[1,0] -; ALL-NEXT: vpermilpd {{.*}} # xmm0 = xmm0[1,0] -; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1 -; ALL-NEXT: vmovhlps {{.*}} # xmm1 = xmm1[1,1] -; ALL-NEXT: vpermilpd {{.*}} # xmm1 = xmm1[1,0] -; ALL-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm0[1] -; ALL-NEXT: vpermilpd {{.*}} # xmm0 = xmm0[1,0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: @stress_test1 +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*}} # xmm0 = xmm1[1,0] +; AVX1-NEXT: vpermilpd {{.*}} # xmm0 = xmm0[1,0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vmovhlps {{.*}} # xmm1 = xmm1[1,1] +; AVX1-NEXT: vpermilpd {{.*}} # xmm1 = xmm1[1,0] +; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm0[1] +; AVX1-NEXT: vpermilpd {{.*}} # xmm0 = xmm0[1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: @stress_test1 +; AVX2: # BB#0: +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm1[3,1,1,0] +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[3,1,2,3] +; AVX2-NEXT: vpermq {{.*}} # ymm1 = ymm1[3,3,1,3] +; AVX2-NEXT: vpshufd {{.*}} # ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpermq {{.*}} # ymm0 = ymm0[0,1,1,0] +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-NEXT: retq %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> @@ -456,24 +558,38 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { } define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) { -; ALL-LABEL: @insert_reg_and_zero_v4i64 -; ALL: # BB#0: -; ALL-NEXT: vmovq %rdi, %xmm0 -; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; ALL-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: @insert_reg_and_zero_v4i64 +; AVX1: # BB#0: +; AVX1-NEXT: vmovq %rdi, %xmm0 +; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: @insert_reg_and_zero_v4i64 +; AVX2: # BB#0: +; AVX2-NEXT: vmovq %rdi, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: retq %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> ret <4 x i64> %shuffle } define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) { -; ALL-LABEL: @insert_mem_and_zero_v4i64 -; ALL: # BB#0: -; ALL-NEXT: vmovq (%rdi), %xmm0 -; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; ALL-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1,2,3] -; ALL-NEXT: retq +; AVX1-LABEL: @insert_mem_and_zero_v4i64 +; AVX1: # BB#0: +; AVX1-NEXT: vmovq (%rdi), %xmm0 +; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: @insert_mem_and_zero_v4i64 +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*}} # ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-NEXT: retq %a = load i64* %ptr %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index 1922150f900..566ce1ea158 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -645,10 +645,15 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) { } define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) { -; ALL-LABEL: @shuffle_v8i32_01014545 -; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[0,0,2,2] -; ALL-NEXT: retq +; AVX1-LABEL: @shuffle_v8i32_01014545 +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*}} # ymm0 = ymm0[0,0,2,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: @shuffle_v8i32_01014545 +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle }