- When the DAG combiner is folding a bit convert into a BUILD_VECTOR, it should check whether the BUILD_VECTOR is essentially a SCALAR_TO_VECTOR. Avoid turning (v8i16) <10, u, u, u> into <10, 0, u, u, u, u, u, u>; instead, simply convert it to a SCALAR_TO_VECTOR of the proper type.

- X86 now normalizes SCALAR_TO_VECTOR to (BIT_CONVERT (v4i32 SCALAR_TO_VECTOR)). Get rid of X86ISD::S2VEC.
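
The first change, as a standalone sketch (hypothetical helper name, not the LLVM code itself): when the combiner shrinks the elements of a constant BUILD_VECTOR, each source constant is split into little-endian pieces, and for a scalar-to-vector node whose value survives intact in the low piece, the expansion buys nothing:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Split one SrcBits-wide constant into SrcBits/DstBits little-endian
    // pieces, the way the combiner expands a BUILD_VECTOR element when a bit
    // convert narrows the element type. Assumes DstBits < 64.
    std::vector<uint64_t> splitConstant(uint64_t OpVal, unsigned SrcBits,
                                        unsigned DstBits) {
      std::vector<uint64_t> Pieces;
      for (unsigned j = 0; j != SrcBits / DstBits; ++j) {
        Pieces.push_back(OpVal & ((1ULL << DstBits) - 1));
        OpVal >>= DstBits;
      }
      return Pieces;
    }

    int main() {
      // Element 0 of (v4i32) <10, u, u, u> viewed as v8i16: 10 splits into
      // <10, 0>, and each undef element would become two undef pieces.
      std::vector<uint64_t> Pieces = splitConstant(10, 32, 16);
      assert(Pieces[0] == 10 && Pieces[1] == 0);
      // The low piece equals the original scalar, so the whole node is still
      // just "scalar 10 in element 0" and can stay a SCALAR_TO_VECTOR of
      // v8i16 instead of becoming <10, 0, u, u, u, u, u, u>.
      return 0;
    }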


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@47290 91177308-0d34-0410-b5e6-96231b3b80d8
Evan Cheng 2008-02-18 23:04:32 +00:00
parent e0cfecf47d
commit efec751a1b
8 changed files with 128 additions and 59 deletions


@@ -611,6 +611,11 @@ namespace ISD {
   /// BUILD_VECTOR where all of the elements are 0 or undef.
   bool isBuildVectorAllZeros(const SDNode *N);
 
+  /// isScalarToVector - Return true if the specified node is a
+  /// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low
+  /// element is not an undef.
+  bool isScalarToVector(const SDNode *N);
+
   /// isDebugLabel - Return true if the specified node represents a debug
   /// label (i.e. ISD::LABEL or TargetInstrInfo::LABEL node and third operand
   /// is 0).


@@ -3450,14 +3450,16 @@ ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *BV, MVT::ValueType DstEltVT) {
       Ops.push_back(DAG.getConstant(NewBits, DstEltVT));
     }
 
-    MVT::ValueType VT = MVT::getVectorType(DstEltVT,
-                                           Ops.size());
+    MVT::ValueType VT = MVT::getVectorType(DstEltVT, Ops.size());
     return DAG.getNode(ISD::BUILD_VECTOR, VT, &Ops[0], Ops.size());
   }
 
   // Finally, this must be the case where we are shrinking elements: each input
   // turns into multiple outputs.
+  bool isS2V = ISD::isScalarToVector(BV);
   unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
+  MVT::ValueType VT = MVT::getVectorType(DstEltVT,
+                                         NumOutputsPerInput * BV->getNumOperands());
   SmallVector<SDOperand, 8> Ops;
   for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
     if (BV->getOperand(i).getOpcode() == ISD::UNDEF) {
@@ -3466,18 +3468,19 @@ ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *BV, MVT::ValueType DstEltVT) {
       continue;
     }
     uint64_t OpVal = cast<ConstantSDNode>(BV->getOperand(i))->getValue();
     for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
       unsigned ThisVal = OpVal & ((1ULL << DstBitSize)-1);
-      OpVal >>= DstBitSize;
       Ops.push_back(DAG.getConstant(ThisVal, DstEltVT));
+      if (isS2V && i == 0 && j == 0 && ThisVal == OpVal)
+        // Simply turn this into a SCALAR_TO_VECTOR of the new type.
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Ops[0]);
+      OpVal >>= DstBitSize;
     }
 
     // For big endian targets, swap the order of the pieces of each element.
     if (TLI.isBigEndian())
       std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
   }
 
-  MVT::ValueType VT = MVT::getVectorType(DstEltVT, Ops.size());
   return DAG.getNode(ISD::BUILD_VECTOR, VT, &Ops[0], Ops.size());
 }


@@ -176,6 +176,27 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
   return true;
 }
 
+/// isScalarToVector - Return true if the specified node is a
+/// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low
+/// element is not an undef.
+bool ISD::isScalarToVector(const SDNode *N) {
+  if (N->getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return true;
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+  if (N->getOperand(0).getOpcode() == ISD::UNDEF)
+    return false;
+  unsigned NumElems = N->getNumOperands();
+  for (unsigned i = 1; i < NumElems; ++i) {
+    SDOperand V = N->getOperand(i);
+    if (V.getOpcode() != ISD::UNDEF)
+      return false;
+  }
+  return true;
+}
+
 /// isDebugLabel - Return true if the specified node represents a debug
 /// label (i.e. ISD::LABEL or TargetInstrInfo::LABEL node and third operand
 /// is 0).
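
For illustration, here is the same predicate over a toy element list, with an empty optional standing in for an undef operand (a hedged sketch; looksLikeScalarToVector is a hypothetical name, not an LLVM API):

    #include <optional>
    #include <vector>

    // Mirrors the BUILD_VECTOR case of ISD::isScalarToVector: element 0 must
    // be defined and every higher element must be undef.
    bool looksLikeScalarToVector(const std::vector<std::optional<int>> &Elts) {
      if (Elts.empty() || !Elts[0].has_value())
        return false;                 // low element must not be undef
      for (size_t i = 1; i < Elts.size(); ++i)
        if (Elts[i].has_value())
          return false;               // a defined high element disqualifies it
      return true;
    }

    int main() {
      bool A = looksLikeScalarToVector({10, {}, {}, {}}); // <10, u, u, u>: true
      bool B = looksLikeScalarToVector({10, 0, {}, {}});  // <10, 0, u, u>: false
      return (A && !B) ? 0 : 1;
    }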


@@ -583,7 +583,6 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8,  Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Custom);
-    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
   }
@@ -3834,7 +3833,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
 SDOperand
 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
   SDOperand AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0));
-  return DAG.getNode(X86ISD::S2VEC, Op.getValueType(), AnyExt);
+  MVT::ValueType VT = MVT::v2i32;
+  switch (Op.getValueType()) {
+  default: break;
+  case MVT::v16i8:
+  case MVT::v8i16:
+    VT = MVT::v4i32;
+    break;
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(),
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, AnyExt));
 }
 
 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
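
The rule the new LowerSCALAR_TO_VECTOR applies can be summarized in a small helper (illustrative enum and function, not the MVT API): 128-bit results are built as a v4i32 SCALAR_TO_VECTOR and 64-bit MMX results as v2i32, with a BIT_CONVERT recovering the requested type. Note the hook is only reached for the types marked Custom above.

    // Illustrative only; LLVM's real code switches over MVT::ValueType.
    enum class VecVT { v16i8, v8i16, v8i8, v4i16, v2i32 };
    enum class S2VVT { v4i32, v2i32 };

    // The intermediate SCALAR_TO_VECTOR type chosen before bit converting
    // back to the requested result type.
    S2VVT normalizedS2VType(VecVT ResultVT) {
      switch (ResultVT) {
      case VecVT::v16i8:
      case VecVT::v8i16:
        return S2VVT::v4i32;  // 128-bit XMM result: scalar in an i32 lane
      default:
        return S2VVT::v2i32;  // 64-bit MMX result
      }
    }
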
@@ -5357,7 +5365,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
-  case X86ISD::S2VEC:              return "X86ISD::S2VEC";
   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";


@@ -166,10 +166,6 @@ namespace llvm {
       /// relative displacements.
       WrapperRIP,
 
-      /// S2VEC - X86 version of SCALAR_TO_VECTOR. The destination base does not
-      /// have to match the operand type.
-      S2VEC,
-
       /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
       /// i32, corresponds to X86::PEXTRB.
       PEXTRB,


@@ -156,12 +156,13 @@ def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
 //===----------------------------------------------------------------------===//
 // Data Transfer Instructions
 //===----------------------------------------------------------------------===//
 
-let neverHasSideEffects = 1 in
 def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
-                        "movd\t{$src, $dst|$dst, $src}", []>;
-let isSimpleLoad = 1, mayLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (v2i32 (scalar_to_vector GR32:$src)))]>;
+let isSimpleLoad = 1, isReMaterializable = 1 in
 def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
-                        "movd\t{$src, $dst|$dst, $src}", []>;
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
 let mayStore = 1 in
 def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
                         "movd\t{$src, $dst|$dst, $src}", []>;
@@ -547,27 +548,25 @@ def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
 def : Pat<(v8i8  (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
 
-def MMX_X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>;
-
-// Move scalar to XMM zero-extended
+// movd to XMM register zero-extends
 let AddedComplexity = 15 in {
 def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
-                  (v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+                  (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
+                  MMX_MOVL_shuffle_mask)),
           (MMX_MOVZDI2PDIrr GR32:$src)>;
 def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
-                  (v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
-          (MMX_MOVZDI2PDIrr GR32:$src)>;
-def : Pat<(v2i32 (vector_shuffle immAllZerosV,
-                  (v2i32 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+                  (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
+                  MMX_MOVL_shuffle_mask)),
           (MMX_MOVZDI2PDIrr GR32:$src)>;
 }
 
-// Scalar to v2i32 / v4i16 / v8i8. The source may be a GR32, but only the lower
+// Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower
 // 8 or 16-bits matter.
-def : Pat<(v8i8  (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
-def : Pat<(v4i16 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
-def : Pat<(v2i32 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(bc_v8i8  (v2i32 (scalar_to_vector GR32:$src))),
+          (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
+          (MMX_MOVD64rr GR32:$src)>;
 
 // Patterns to perform canonical versions of vector shuffling.
 let AddedComplexity = 10 in {

@@ -34,7 +34,6 @@ def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
 def X86fsrl   : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
 def X86comi   : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
 def X86ucomi  : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
-def X86s2vec  : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>;
 def X86pextrb : SDNode<"X86ISD::PEXTRB",
                        SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
 def X86pextrw : SDNode<"X86ISD::PEXTRW",
@@ -1781,22 +1780,6 @@ multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                                          (bitconvert (memopv2i64 addr:$src2))))]>;
 }
 
-multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
-                             string OpcodeStr, Intrinsic IntId> {
-  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
-  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
-               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-               [(set VR128:$dst, (IntId VR128:$src1,
-                                  (bitconvert (memopv2i64 addr:$src2))))]>;
-  def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
-                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                 [(set VR128:$dst, (IntId VR128:$src1,
-                                    (scalar_to_vector (i32 imm:$src2))))]>;
-}
-
 /// PDI_binop_rm - Simple SSE2 binary operator.
 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, bit Commutable = 0> {
@@ -1871,16 +1854,61 @@ defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
 defm PSADBW : PDI_binop_rm_int<0xE0, "psadbw", int_x86_sse2_psad_bw, 1>;
 
-defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_sse2_psll_w>;
-defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", int_x86_sse2_psll_d>;
-defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_sse2_psll_q>;
+defm PSLLW : PDI_binop_rm_int<0xF1, "psllw", int_x86_sse2_psll_w>;
+defm PSLLD : PDI_binop_rm_int<0xF2, "pslld", int_x86_sse2_psll_d>;
+defm PSLLQ : PDI_binop_rm_int<0xF3, "psllq", int_x86_sse2_psll_q>;
-defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", int_x86_sse2_psrl_w>;
-defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", int_x86_sse2_psrl_d>;
-defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_sse2_psrl_q>;
+defm PSRLW : PDI_binop_rm_int<0xD1, "psrlw", int_x86_sse2_psrl_w>;
+defm PSRLD : PDI_binop_rm_int<0xD2, "psrld", int_x86_sse2_psrl_d>;
+defm PSRLQ : PDI_binop_rm_int<0xD3, "psrlq", int_x86_sse2_psrl_q>;
+defm PSRAW : PDI_binop_rm_int<0xE1, "psraw", int_x86_sse2_psra_w>;
+defm PSRAD : PDI_binop_rm_int<0xE2, "psrad", int_x86_sse2_psra_d>;
+
+// Some immediate variants need to match a bit_convert.
+def PSLLWri : PDIi8<0x71, MRM6r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psllw\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psll_w VR128:$src1,
+                       (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+def PSLLDri : PDIi8<0x72, MRM6r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "pslld\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psll_d VR128:$src1,
+                       (scalar_to_vector (i32 imm:$src2))))]>;
+def PSLLQri : PDIi8<0x73, MRM6r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psllq\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psll_q VR128:$src1,
+                       (bc_v2i64 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+def PSRLWri : PDIi8<0x71, MRM2r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psrlw\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psrl_w VR128:$src1,
+                       (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+def PSRLDri : PDIi8<0x72, MRM2r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psrld\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psrl_d VR128:$src1,
+                       (scalar_to_vector (i32 imm:$src2))))]>;
+def PSRLQri : PDIi8<0x73, MRM2r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psrlq\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psrl_q VR128:$src1,
+                       (bc_v2i64 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+def PSRAWri : PDIi8<0x71, MRM4r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psraw\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psra_w VR128:$src1,
+                       (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+def PSRADri : PDIi8<0x72, MRM4r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psrad\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psra_d VR128:$src1,
+                       (scalar_to_vector (i32 imm:$src2))))]>;
-defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_sse2_psra_w>;
-defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d>;
 
 // PSRAQ doesn't exist in SSE[1-3].
 
 // 128-bit logical shifts.
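
At the source level, the immediate forms these definitions select correspond to the SSE2 shift intrinsics with a scalar count; the snippet below (standard <emmintrin.h> intrinsics, built with SSE2 enabled) mirrors the new test's psrlw/pslld cases, where the shift count should now be encoded as an immediate rather than loaded from a constant pool:

    #include <cassert>
    #include <emmintrin.h>

    int main() {
      // psrlw by 14: logical right shift of each i16 lane.
      __m128i V = _mm_set1_epi16((short)0xC000);
      __m128i R = _mm_srli_epi16(V, 14);      // expected to select psrlw $14
      assert(_mm_extract_epi16(R, 0) == 3);   // 0xC000 >> 14 == 3
      // pslld by 14: left shift of each i32 lane.
      __m128i W = _mm_slli_epi32(_mm_set1_epi32(1), 14);
      assert(_mm_cvtsi128_si32(W) == (1 << 14));
      return 0;
    }
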
@@ -2729,13 +2757,6 @@ let Predicates = [HasSSE2] in
 def : Pat<(fextend (loadf32 addr:$src)),
           (CVTSS2SDrm addr:$src)>;
 
-// Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
-// 16-bits matter.
-def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
-      Requires<[HasSSE2]>;
-def : Pat<(v16i8 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
-      Requires<[HasSSE2]>;
-
 // bit_convert
 let Predicates = [HasSSE2] in {
   def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;


@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep CPI
+
+define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
+	%tmp1 = bitcast <2 x i64> %b1 to <8 x i16>
+	%tmp2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w( <8 x i16> %tmp1, <8 x i16> bitcast (<4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > to <8 x i16>) ) nounwind readnone
+	%tmp3 = bitcast <8 x i16> %tmp2 to <2 x i64>
+	ret <2 x i64> %tmp3
+}
+
+define <4 x i32> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
+	%tmp1 = bitcast <2 x i64> %b1 to <4 x i32>
+	%tmp2 = tail call <4 x i32> @llvm.x86.sse2.psll.d( <4 x i32> %tmp1, <4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > ) nounwind readnone
+	ret <4 x i32> %tmp2
+}
+
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone