diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 69efae738af..3c13b611ce2 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11973,9 +11973,11 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
           return SDValue();
       }
 
-      // Let's see if the target supports this vector_shuffle.
+      // Let's see if the target supports this vector_shuffle and make sure
+      // we're not running after operation legalization where it may have
+      // custom lowered the vector shuffles.
       EVT RVT = RHS.getValueType();
-      if (!TLI.isVectorClearMaskLegal(Indices, RVT))
+      if (LegalOperations || !TLI.isVectorClearMaskLegal(Indices, RVT))
         return SDValue();
 
       // Return the new VECTOR_SHUFFLE node.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ce5557991a0..d00b09e71d1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -926,6 +926,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
@@ -994,6 +995,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
         continue;
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
@@ -1017,6 +1019,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
 
@@ -1098,13 +1102,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL, MVT::v4i32, Legal);
 
-    setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
-    // There is no BLENDI for byte vectors. We don't need to custom lower
-    // some vselects for now.
+    // We directly match byte blends in the backend as they match the VSELECT
+    // condition form.
     setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
 
     // SSE41 brings specific instructions for doing vector sign extend even in
@@ -1245,11 +1244,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
 
-    setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
-
     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
@@ -1293,9 +1287,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
       setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
 
-      setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
-      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
       // when we have a 256bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
@@ -1368,6 +1359,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
@@ -1375,6 +1367,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
     }
 
+    if (Subtarget->hasInt256())
+      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+
     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
       MVT VT = (MVT::SimpleValueType)i;
@@ -13139,48 +13135,29 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
   return true;
 }
 
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
-                                    SelectionDAG &DAG) {
+/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+                                           const X86Subtarget *Subtarget,
+                                           SelectionDAG &DAG) {
   SDValue Cond = Op.getOperand(0);
   SDValue LHS = Op.getOperand(1);
   SDValue RHS = Op.getOperand(2);
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
 
   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
+  auto *CondBV = cast<BuildVectorSDNode>(Cond);
 
-  // Check the mask for BLEND and build the value.
-  unsigned MaskValue = 0;
-  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
-    return SDValue();
-
-  // Convert i32 vectors to floating point if it is not AVX2.
-  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
-  MVT BlendVT = VT;
-  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
-    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
-                               NumElems);
-    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
-    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+  // Only non-legal VSELECTs reach this lowering, convert those into generic
+  // shuffles and re-use the shuffle lowering path for blends.
+  SmallVector<int, 32> Mask;
+  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
+    SDValue CondElt = CondBV->getOperand(i);
+    Mask.push_back(
+        isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
   }
-
-  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
-                            DAG.getConstant(MaskValue, MVT::i32));
-  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
 }
 
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -13191,10 +13168,16 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
     return SDValue();
 
-  SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
+  // Try to lower this to a blend-style vector shuffle. This can handle all
+  // constant condition cases.
+  SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG);
   if (BlendOp.getNode())
     return BlendOp;
 
+  // Variable blends are only legal from SSE4.1 onward.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
   // Some types for vselect were previously set to Expand, not Legal or
   // Custom. Return an empty SDValue so we fall-through to Expand, after
   // the Custom lowering phase.
diff --git a/test/Analysis/CostModel/X86/vselect-cost.ll b/test/Analysis/CostModel/X86/vselect-cost.ll
index 2416777506d..b7e56ef52a4 100644
--- a/test/Analysis/CostModel/X86/vselect-cost.ll
+++ b/test/Analysis/CostModel/X86/vselect-cost.ll
@@ -11,7 +11,7 @@ define <2 x i64> @test_2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2i64':
-; SSE2: Cost Model: {{.*}} 4 for instruction: %sel = select <2 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
@@ -21,7 +21,7 @@ define <2 x double> @test_2double(<2 x double> %a, <2 x double> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2double':
-; SSE2: Cost Model: {{.*}} 3 for instruction: %sel = select <2 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1>
@@ -31,7 +31,7 @@ define <4 x i32> @test_4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i32':
-; SSE2: Cost Model: {{.*}} 8 for instruction: %sel = select <4 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
@@ -41,7 +41,7 @@ define <4 x i32> @test_4i32(<4 x i32> %a, <4 x i32> %b) {
 
 define <4 x float> @test_4float(<4 x float> %a, <4 x float> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4float':
-; SSE2: Cost Model: {{.*}} 7 for instruction: %sel = select <4 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
@@ -51,7 +51,7 @@ define <4 x float> @test_4float(<4 x float> %a, <4 x float> %b) {
 
 define <16 x i8> @test_16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_16i8':
-; SSE2: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; SSE2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
 ; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
@@ -63,7 +63,7 @@ define <16 x i8> @test_16i8(<16 x i8> %a, <16 x i8> %b) {
 ; <8 x float>. Integers of the same size should also use those instructions.
 define <4 x i64> @test_4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i64':
-; SSE2: Cost Model: {{.*}} 8 for instruction: %sel = select <4 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
@@ -73,7 +73,7 @@ define <4 x i64> @test_4i64(<4 x i64> %a, <4 x i64> %b) {
 
 define <4 x double> @test_4double(<4 x double> %a, <4 x double> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4double':
-; SSE2: Cost Model: {{.*}} 6 for instruction: %sel = select <4 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1>
@@ -83,7 +83,7 @@ define <4 x double> @test_4double(<4 x double> %a, <4 x double> %b) {
 
 define <8 x i32> @test_8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8i32':
-; SSE2: Cost Model: {{.*}} 16 for instruction: %sel = select <8 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
@@ -93,7 +93,7 @@ define <8 x i32> @test_8i32(<8 x i32> %a, <8 x i32> %b) {
 
 define <8 x float> @test_8float(<8 x float> %a, <8 x float> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8float':
-; SSE2: Cost Model: {{.*}} 14 for instruction: %sel = select <8 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1>
 ; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1>
@@ -104,10 +104,9 @@ define <8 x float> @test_8float(<8 x float> %a, <8 x float> %b) {
 ; AVX2
 define <16 x i16> @test_16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK:Printing analysis 'Cost Model Analysis' for function 'test_16i16':
-; SSE2: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction: %sel = select <16 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <16 x i1>
-;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing.
-; AVX: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1>
   %sel = select <16 x i1> , <16 x i16> %a, <16 x i16> %b
   ret <16 x i16> %sel
@@ -115,10 +114,9 @@ define <16 x i16> @test_16i16(<16 x i16> %a, <16 x i16> %b) {
 
 define <32 x i8> @test_32i8(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_32i8':
-; SSE2: Cost Model: {{.*}} 64 for instruction: %sel = select <32 x i1>
+; SSE2: Cost Model: {{.*}} 2 for instruction: %sel = select <32 x i1>
 ; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <32 x i1>
-;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing.
-; AVX: Cost Model: {{.*}} 64 for instruction: %sel = select <32 x i1>
+; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <32 x i1>
 ; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <32 x i1>
   %sel = select <32 x i1> , <32 x i8> %a, <32 x i8> %b
   ret <32 x i8> %sel
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index 18203de7294..8d1dabf2683 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -9,16 +9,14 @@
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
 ; SSE2-LABEL: vsel_float:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: vsel_float:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: vsel_float:
@@ -65,16 +63,14 @@ entry:
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
 ; SSE2-LABEL: vsel_4xi8:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: vsel_4xi8:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
-; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
-; SSSE3-NEXT: orps %xmm1, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: vsel_4xi8:
@@ -99,16 +95,16 @@ entry:
 define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
 ; SSE2-LABEL: vsel_4xi16:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
-; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; 
SSSE3-LABEL: vsel_4xi16: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_4xi16: @@ -133,16 +129,16 @@ entry: define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { ; SSE2-LABEL: vsel_i32: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 -; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_i32: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_i32: @@ -226,16 +222,30 @@ entry: define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) { ; SSE2-LABEL: vsel_8xi16: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 -; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_8xi16: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,4,5,2,3,4,5,10,11,4,5,6,7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_8xi16: @@ -255,16 +265,42 @@ entry: define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { ; SSE2-LABEL: vsel_i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 -; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; 
SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_i8: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_i8: @@ -419,8 +455,8 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { ; ; SSE41-LABEL: vsel_i648: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1] -; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] ; SSE41-NEXT: movaps %xmm5, %xmm1 ; SSE41-NEXT: movaps %xmm7, %xmm3 ; SSE41-NEXT: retq @@ -586,26 +622,22 @@ entry: define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) { ; SSE2-LABEL: constant_blendvps_avx: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0] -; SSE2-NEXT: andps %xmm4, %xmm2 -; SSE2-NEXT: movaps {{.*#+}} xmm5 = [0,0,0,4294967295] -; SSE2-NEXT: andps %xmm5, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: andps %xmm4, %xmm3 -; SSE2-NEXT: andps %xmm5, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] +; SSE2-NEXT: 
movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: constant_blendvps_avx: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0] -; SSSE3-NEXT: andps %xmm4, %xmm2 -; SSSE3-NEXT: movaps {{.*#+}} xmm5 = [0,0,0,4294967295] -; SSSE3-NEXT: andps %xmm5, %xmm0 -; SSSE3-NEXT: orps %xmm2, %xmm0 -; SSSE3-NEXT: andps %xmm4, %xmm3 -; SSSE3-NEXT: andps %xmm5, %xmm1 -; SSSE3-NEXT: orps %xmm3, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: constant_blendvps_avx: @@ -626,26 +658,134 @@ entry: define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { ; SSE2-LABEL: constant_pblendvb_avx2: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] -; SSE2-NEXT: andps %xmm4, %xmm2 -; SSE2-NEXT: movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] -; SSE2-NEXT: andps %xmm5, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: andps %xmm4, %xmm3 -; SSE2-NEXT: andps %xmm5, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: # kill: XMM0 XMM4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; 
SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: packuswb %xmm0, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,3,4,5,6,7] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd 
{{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: packuswb %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: constant_pblendvb_avx2: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] -; SSSE3-NEXT: andps %xmm4, %xmm2 -; SSSE3-NEXT: movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] -; SSSE3-NEXT: andps %xmm5, %xmm0 -; SSSE3-NEXT: orps %xmm2, %xmm0 -; SSSE3-NEXT: andps %xmm4, %xmm3 -; SSSE3-NEXT: andps %xmm5, %xmm1 -; SSSE3-NEXT: orps %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = <128,128,5,128,128,128,13,128,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: pshufb %xmm8, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = <1,3,128,7,9,11,128,15,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pshufb %xmm6, %xmm7 +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <0,128,128,128,8,128,128,128,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm5, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <128,2,4,6,128,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm4, %xmm0 +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSSE3-NEXT: pshufb %xmm6, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 +; SSSE3-NEXT: pshufb %xmm5, %xmm3 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: constant_pblendvb_avx2: @@ -660,9 +800,27 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { ; ; AVX1-LABEL: constant_pblendvb_avx2: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,5,128,128,128,13,128,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = 
<1,3,128,7,9,11,128,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX1-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <0,128,128,128,8,128,128,128,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <128,2,4,6,128,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm4 +; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm5 +; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_pblendvb_avx2: diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll index 2e3d7f5428f..1cbc7780b9f 100644 --- a/test/CodeGen/X86/vselect.ll +++ b/test/CodeGen/X86/vselect.ll @@ -6,9 +6,8 @@ define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: andps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; CHECK-NEXT: retq %1 = select <4 x i1> , <4 x float> %a, <4 x float> %b ret <4 x float> %1 @@ -53,9 +52,6 @@ define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test6: ; CHECK: # BB#0: -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0] -; CHECK-NEXT: orps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %a ret <8 x i16> %1 @@ -64,9 +60,8 @@ define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test7: ; CHECK: # BB#0: -; CHECK-NEXT: andps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; CHECK-NEXT: movapd %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 @@ -75,9 +70,7 @@ define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test8: ; CHECK: # BB#0: -; CHECK-NEXT: andps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; CHECK-NEXT: retq %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 @@ -103,10 +96,10 @@ define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test11: ; CHECK: # BB#0: -; CHECK-NEXT: movaps {{.*#+}} xmm2 = <0,65535,65535,0,u,65535,65535,u> -; CHECK-NEXT: andps %xmm2, %xmm0 -; CHECK-NEXT: andnps %xmm1, %xmm2 -; CHECK-NEXT: orps %xmm2, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: pshufb {{.*#+}} xmm0 = 
xmm0[2,3,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; CHECK-NEXT: retq
 %1 = select <8 x i1> , <8 x i16> %a, <8 x i16> %b
 ret <8 x i16> %1
diff --git a/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
index 3b3a7875ab3..ece9895a0b4 100644
--- a/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -50,8 +50,8 @@ define void @vectorselect(i1 %cond) {
   %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
   %8 = icmp ult i64 %indvars.iv, 8
 
-; A vector select has a cost of 4 on core2
-; CHECK: cost of 4 for VF 2 {{.*}} select i1 %8, i32 %6, i32 0
+; A vector select has a cost of 1 on core2
+; CHECK: cost of 1 for VF 2 {{.*}} select i1 %8, i32 %6, i32 0
   %sel = select i1 %8, i32 %6, i32 zeroinitializer
   store i32 %sel, i32* %7, align 4
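
For illustration only (not part of the patch): with an all-constant condition, a vector select is a fixed per-lane blend, which is exactly what a shufflevector with a constant mask expresses. The new lowerVSELECTtoVectorShuffle path builds that mask as i for a lane whose condition element is non-zero and i + NumElts for a zero lane (non-constant lanes become undef). A minimal LLVM IR sketch of the equivalence, using hypothetical function names, is:

define <4 x float> @blend_as_select(<4 x float> %a, <4 x float> %b) {
  ; Lanes 0 and 2 take %a, lanes 1 and 3 take %b.
  %sel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %sel
}

define <4 x float> @blend_as_shuffle(<4 x float> %a, <4 x float> %b) {
  ; The same blend as a shuffle: indices 0 and 2 address %a, 5 and 7 address %b.
  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %shuf
}

This rewrite lets the generic x86 shuffle lowering pick the best blend instruction for each type, which is also why the DAGCombiner hunk above now bails out once LegalOperations is set: after legalization the target may already have custom lowered its shuffles, per the comment added in that hunk.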