Now that we have a canonical way to handle 256-bit splats:

vinsertf128 $1 + vpermilps $0, remove the old code that used to first do the splat in a 128-bit vector and then insert it into a larger one. This is better because the handling code gets simpler and also makes a better room for the upcoming vbroadcast! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@137807 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-16 20:40:16 +00:00 · 2011-08-17 02:29:10 +00:00 · 2011-08-17 02:29:10 +00:00 · fc0a702128
commit fc0a702128
parent 23e9ef994e
2 changed files with 5 additions and 43 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -4205,34 +4205,6 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
  return DAG.getNode(ISD::BITCAST, dl, VT, V);
 }

-/// PromoteVectorToScalarSplat - Since there's no native support for
-/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
-/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
-/// shuffle before the insertion, this yields less instructions in the end.
-static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV,
-                                          SelectionDAG &DAG) {
-  EVT SrcVT = SV->getValueType(0);
-  SDValue V1 = SV->getOperand(0);
-  DebugLoc dl = SV->getDebugLoc();
-  int NumElems = SrcVT.getVectorNumElements();
-
-  assert(SrcVT.is256BitVector() && "unknown howto handle vector type");
-  assert(SV->isSplat() && "shuffle must be a splat");
-
-  int SplatIdx = SV->getSplatIndex();
-  const int Mask[4] = { SplatIdx, SplatIdx, SplatIdx, SplatIdx };
-
-  EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
-                             NumElems/2);
-  SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1),
-                                     DAG.getUNDEF(SVT), Mask);
-  SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1,
-                                    DAG.getConstant(0, MVT::i32), DAG, dl);
-
-  return Insert128BitVector(InsV, SV1,
-                       DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
-}
-
 /// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
 /// v8i32, v16i16 or v32i8 to v8f32.
 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
@ -6199,16 +6171,6 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
      return Op;

-    // Since there's no native support for scalar_to_vector for 256-bit AVX, a
-    // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
-    // idiom and do the shuffle before the insertion, this yields less
-    // instructions in the end.
-    if (VT.is256BitVector() &&
-        V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
-        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
-        V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
-      return PromoteVectorToScalarSplat(SVOp, DAG);
-
    // Handle splats by matching through known shuffle masks
    if (VT.is128BitVector() && NumElem <= 4)
      return SDValue();
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@ -24,8 +24,8 @@ entry:
 }

 ; CHECK: vmovd
-; CHECK-NEXT: movlhps
 ; CHECK-NEXT: vinsertf128 $1
+; CHECK-NEXT: vpermilps $0
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
@ -35,8 +35,8 @@ entry:
  ret <4 x i64> %vecinit6.i
 }

-; CHECK: vshufpd
-; CHECK-NEXT: vinsertf128 $1
+; CHECK: vinsertf128 $1
+; CHECK-NEXT: vpermilps $0
 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
 entry:
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
@ -78,8 +78,8 @@ __load_and_broadcast_32.exit1249:                 ; preds = %load.i1247, %for_ex
  ret <8 x float> %load_broadcast12281250
 }

-; CHECK: vpshufd  $0
-; CHECK-NEXT: vinsertf128 $1
+; CHECK: vinsertf128 $1
+; CHECK-NEXT: vpermilps $0
 define <8 x float> @funcF(i32* %ptr) nounwind {
  %val = load i32* %ptr, align 4
  %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6