diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 415e291d13f..d6a492d0c19 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -611,6 +611,11 @@ namespace ISD {
   /// BUILD_VECTOR where all of the elements are 0 or undef.
   bool isBuildVectorAllZeros(const SDNode *N);
 
+  /// isScalarToVector - Return true if the specified node is a
+  /// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low
+  /// element is not an undef.
+  bool isScalarToVector(const SDNode *N);
+
   /// isDebugLabel - Return true if the specified node represents a debug
   /// label (i.e. ISD::LABEL or TargetInstrInfo::LABEL node and third operand
   /// is 0).
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 29a31e4e1d6..026666c45fc 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3450,14 +3450,16 @@ ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *BV, MVT::ValueType DstEltVT) {
       Ops.push_back(DAG.getConstant(NewBits, DstEltVT));
     }
 
-    MVT::ValueType VT = MVT::getVectorType(DstEltVT,
-                                           Ops.size());
+    MVT::ValueType VT = MVT::getVectorType(DstEltVT, Ops.size());
     return DAG.getNode(ISD::BUILD_VECTOR, VT, &Ops[0], Ops.size());
   }
 
   // Finally, this must be the case where we are shrinking elements: each input
   // turns into multiple outputs.
+  bool isS2V = ISD::isScalarToVector(BV);
   unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
+  MVT::ValueType VT = MVT::getVectorType(DstEltVT,
+                                         NumOutputsPerInput * BV->getNumOperands());
   SmallVector<SDOperand, 8> Ops;
   for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
     if (BV->getOperand(i).getOpcode() == ISD::UNDEF) {
@@ -3466,18 +3468,19 @@ ConstantFoldBIT_CONVERTofBUILD_VECTOR(SDNode *BV, MVT::ValueType DstEltVT) {
       continue;
     }
     uint64_t OpVal = cast<ConstantSDNode>(BV->getOperand(i))->getValue();
-
     for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
       unsigned ThisVal = OpVal & ((1ULL << DstBitSize)-1);
-      OpVal >>= DstBitSize;
       Ops.push_back(DAG.getConstant(ThisVal, DstEltVT));
+      if (isS2V && i == 0 && j == 0 && ThisVal == OpVal)
+        // Simply turn this into a SCALAR_TO_VECTOR of the new type.
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Ops[0]);
+      OpVal >>= DstBitSize;
     }
 
     // For big endian targets, swap the order of the pieces of each element.
     if (TLI.isBigEndian())
       std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
   }
 
-  MVT::ValueType VT = MVT::getVectorType(DstEltVT, Ops.size());
   return DAG.getNode(ISD::BUILD_VECTOR, VT, &Ops[0], Ops.size());
 }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c12c98b4e39..f904fa16d58 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -176,6 +176,27 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
   return true;
 }
 
+/// isScalarToVector - Return true if the specified node is a
+/// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low
+/// element is not an undef.
+bool ISD::isScalarToVector(const SDNode *N) {
+  if (N->getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return true;
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+  if (N->getOperand(0).getOpcode() == ISD::UNDEF)
+    return false;
+  unsigned NumElems = N->getNumOperands();
+  for (unsigned i = 1; i < NumElems; ++i) {
+    SDOperand V = N->getOperand(i);
+    if (V.getOpcode() != ISD::UNDEF)
+      return false;
+  }
+  return true;
+}
+
+
 /// isDebugLabel - Return true if the specified node represents a debug
 /// label (i.e. ISD::LABEL or TargetInstrInfo::LABEL node and third operand
 /// is 0).
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e4eb08b4068..61a4d7a5116 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -583,7 +583,6 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
 
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8,  Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
-    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
   }
 
@@ -3834,7 +3833,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
 SDOperand
 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
   SDOperand AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0));
-  return DAG.getNode(X86ISD::S2VEC, Op.getValueType(), AnyExt);
+  MVT::ValueType VT = MVT::v2i32;
+  switch (Op.getValueType()) {
+  default: break;
+  case MVT::v16i8:
+  case MVT::v8i16:
+    VT = MVT::v4i32;
+    break;
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(),
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, AnyExt));
 }
 
 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
@@ -5357,7 +5365,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
-  case X86ISD::S2VEC:              return "X86ISD::S2VEC";
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index b191dfac33f..ec00066f969 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -166,10 +166,6 @@ namespace llvm {
       /// relative displacements.
      WrapperRIP,
 
-      /// S2VEC - X86 version of SCALAR_TO_VECTOR. The destination base does not
-      /// have to match the operand type.
-      S2VEC,
-
      /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
      /// i32, corresponds to X86::PEXTRB.
       PEXTRB,
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 3f2f1debbf8..c9ea65d9f01 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -156,12 +156,13 @@ def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
 //===----------------------------------------------------------------------===//
 // Data Transfer Instructions
 
-let neverHasSideEffects = 1 in
 def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
-                        "movd\t{$src, $dst|$dst, $src}", []>;
-let isSimpleLoad = 1, mayLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (v2i32 (scalar_to_vector GR32:$src)))]>;
+let isSimpleLoad = 1, isReMaterializable = 1 in
 def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
-                        "movd\t{$src, $dst|$dst, $src}", []>;
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
 let mayStore = 1 in
 def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
                         "movd\t{$src, $dst|$dst, $src}", []>;
@@ -547,27 +548,25 @@ def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
 def : Pat<(v8i8  (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
-def MMX_X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>;
-
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
   def : Pat<(v8i8 (vector_shuffle immAllZerosV_bc,
-                    (v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+                    (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))),
+                    MMX_MOVL_shuffle_mask)),
             (MMX_MOVZDI2PDIrr GR32:$src)>;
   def : Pat<(v4i16 (vector_shuffle immAllZerosV_bc,
-                    (v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
-            (MMX_MOVZDI2PDIrr GR32:$src)>;
-  def : Pat<(v2i32 (vector_shuffle immAllZerosV,
-                    (v2i32 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+                    (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
+                    MMX_MOVL_shuffle_mask)),
             (MMX_MOVZDI2PDIrr GR32:$src)>;
 }
 
-// Scalar to v2i32 / v4i16 / v8i8. The source may be a GR32, but only the lower
+// Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower
 // 8 or 16-bits matter.
-def : Pat<(v8i8  (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
-def : Pat<(v4i16 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
-def : Pat<(v2i32 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(bc_v8i8  (v2i32 (scalar_to_vector GR32:$src))),
+          (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))),
+          (MMX_MOVD64rr GR32:$src)>;
 
 // Patterns to perform canonical versions of vector shuffling.
 let AddedComplexity = 10 in {
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 3d225ee7fa0..46f70da83d2 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -34,7 +34,6 @@ def X86frcp   : SDNode<"X86ISD::FRCP",     SDTFPUnaryOp>;
 def X86fsrl   : SDNode<"X86ISD::FSRL",     SDTX86FPShiftOp>;
 def X86comi   : SDNode<"X86ISD::COMI",     SDTX86CmpTest>;
 def X86ucomi  : SDNode<"X86ISD::UCOMI",    SDTX86CmpTest>;
-def X86s2vec  : SDNode<"X86ISD::S2VEC",    SDTypeProfile<1, 1, []>, []>;
 def X86pextrb : SDNode<"X86ISD::PEXTRB",
                  SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
 def X86pextrw : SDNode<"X86ISD::PEXTRW",
@@ -1781,22 +1780,6 @@ multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                (bitconvert (memopv2i64 addr:$src2))))]>;
 }
 
-multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
-                             string OpcodeStr, Intrinsic IntId> {
-  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
-  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
-               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-               [(set VR128:$dst, (IntId VR128:$src1,
-                                  (bitconvert (memopv2i64 addr:$src2))))]>;
-  def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
-               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-               [(set VR128:$dst, (IntId VR128:$src1,
-                                  (scalar_to_vector (i32 imm:$src2))))]>;
-}
-
-
 /// PDI_binop_rm - Simple SSE2 binary operator.
 multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, bit Commutable = 0> {
@@ -1871,16 +1854,61 @@ defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
 defm PSADBW : PDI_binop_rm_int<0xE0, "psadbw", int_x86_sse2_psad_bw, 1>;
 
-defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_sse2_psll_w>;
-defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", int_x86_sse2_psll_d>;
-defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_sse2_psll_q>;
+defm PSLLW : PDI_binop_rm_int<0xF1, "psllw", int_x86_sse2_psll_w>;
+defm PSLLD : PDI_binop_rm_int<0xF2, "pslld", int_x86_sse2_psll_d>;
+defm PSLLQ : PDI_binop_rm_int<0xF3, "psllq", int_x86_sse2_psll_q>;
 
-defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", int_x86_sse2_psrl_w>;
-defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", int_x86_sse2_psrl_d>;
-defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_sse2_psrl_q>;
+defm PSRLW : PDI_binop_rm_int<0xD1, "psrlw", int_x86_sse2_psrl_w>;
+defm PSRLD : PDI_binop_rm_int<0xD2, "psrld", int_x86_sse2_psrl_d>;
+defm PSRLQ : PDI_binop_rm_int<0xD3, "psrlq", int_x86_sse2_psrl_q>;
+
+defm PSRAW : PDI_binop_rm_int<0xE1, "psraw", int_x86_sse2_psra_w>;
+defm PSRAD : PDI_binop_rm_int<0xE2, "psrad", int_x86_sse2_psra_d>;
+
+// Some immediate variants need to match a bit_convert.
+def PSLLWri : PDIi8<0x71, MRM6r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psllw\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psll_w VR128:$src1,
+                      (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+def PSLLDri : PDIi8<0x72, MRM6r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "pslld\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psll_d VR128:$src1,
+                      (scalar_to_vector (i32 imm:$src2))))]>;
+def PSLLQri : PDIi8<0x73, MRM6r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psllq\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psll_q VR128:$src1,
+                      (bc_v2i64 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+
+def PSRLWri : PDIi8<0x71, MRM2r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psrlw\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psrl_w VR128:$src1,
+                      (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+def PSRLDri : PDIi8<0x72, MRM2r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psrld\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psrl_d VR128:$src1,
+                      (scalar_to_vector (i32 imm:$src2))))]>;
+def PSRLQri : PDIi8<0x73, MRM2r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psrlq\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psrl_q VR128:$src1,
+                      (bc_v2i64 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+
+def PSRAWri : PDIi8<0x71, MRM4r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psraw\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psra_w VR128:$src1,
+                      (bc_v8i16 (v4i32 (scalar_to_vector (i32 imm:$src2))))))]>;
+def PSRADri : PDIi8<0x72, MRM4r, (outs VR128:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "psrad\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_psra_d VR128:$src1,
+                      (scalar_to_vector (i32 imm:$src2))))]>;
 
-defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_sse2_psra_w>;
-defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d>;
 
 // PSRAQ doesn't exist in SSE[1-3].
 
 // 128-bit logical shifts.
@@ -2729,13 +2757,6 @@ let Predicates = [HasSSE2] in
 def : Pat<(fextend (loadf32 addr:$src)),
           (CVTSS2SDrm addr:$src)>;
 
-// Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
-// 16-bits matter.
-def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
-      Requires<[HasSSE2]>;
-def : Pat<(v16i8 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
-      Requires<[HasSSE2]>;
-
 // bit_convert
 let Predicates = [HasSSE2] in {
   def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
diff --git a/test/CodeGen/X86/vec_shift2.ll b/test/CodeGen/X86/vec_shift2.ll
new file mode 100644
index 00000000000..b73f5f49000
--- /dev/null
+++ b/test/CodeGen/X86/vec_shift2.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep CPI
+
+define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
+	%tmp1 = bitcast <2 x i64> %b1 to <8 x i16>
+	%tmp2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w( <8 x i16> %tmp1, <8 x i16> bitcast (<4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > to <8 x i16>) ) nounwind readnone
+	%tmp3 = bitcast <8 x i16> %tmp2 to <2 x i64>
+	ret <2 x i64> %tmp3
+}
+
+define <4 x i32> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
+	%tmp1 = bitcast <2 x i64> %b1 to <4 x i32>
+	%tmp2 = tail call <4 x i32> @llvm.x86.sse2.psll.d( <4 x i32> %tmp1, <4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > ) nounwind readnone
+	ret <4 x i32> %tmp2
+}
+
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
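For illustration only (this snippet is not part of the patch; the function name @t3 is hypothetical), here is the shift variant the test above does not cover, in the same style as vec_shift2.ll. The shift amount is a constant <4 x i32> BUILD_VECTOR with only element 0 defined, so ISD::isScalarToVector lets ConstantFoldBIT_CONVERTofBUILD_VECTOR collapse the bitcast operand into a SCALAR_TO_VECTOR, which the new PSRAWri pattern then selects as an immediate shift instead of a constant-pool load:

; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep CPI
define <2 x i64> @t3(<2 x i64> %a) nounwind {
	%tmp1 = bitcast <2 x i64> %a to <8 x i16>
	; The amount operand below has only element 0 defined; after the combine it
	; becomes the (bc_v8i16 (v4i32 (scalar_to_vector imm))) shape PSRAWri matches.
	%tmp2 = tail call <8 x i16> @llvm.x86.sse2.psra.w( <8 x i16> %tmp1, <8 x i16> bitcast (<4 x i32> < i32 5, i32 undef, i32 undef, i32 undef > to <8 x i16>) ) nounwind readnone
	%tmp3 = bitcast <8 x i16> %tmp2 to <2 x i64>
	ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone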