diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index ed2d7e6e8c0..0bb0f69ff43 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1062,6 +1062,7 @@ void X86TargetLowering::resetOperationActions() { // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); + setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); @@ -1227,6 +1228,7 @@ void X86TargetLowering::resetOperationActions() { // Don't lower v32i8 because there is no 128-bit byte mul setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); @@ -11729,6 +11731,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_pmuldq: + case Intrinsic::x86_avx2_pmul_dq: + return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_pmulhu_w: case Intrinsic::x86_avx2_pmulhu_w: return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), @@ -13168,8 +13175,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } -static SDValue LowerUMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); EVT VT = Op0.getValueType(); SDLoc dl(Op); @@ -13185,15 +13192,17 @@ static SDValue LowerUMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; + unsigned Opcode = + Op->getOpcode() == ISD::UMUL_LOHI ? X86ISD::PMULUDQ : X86ISD::PMULDQ; SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(X86ISD::PMULUDQ, dl, MulVT, Op0, Op1)); + DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(X86ISD::PMULUDQ, dl, MulVT, Hi0, Hi1)); + DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); // Shuffle it back into the right order. - const int HighMask[] = {1, 3, 5, 7, 9, 11, 13, 15}; + const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15}; SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {0, 2, 4, 6, 8, 10, 12, 14}; + const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14}; SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows); @@ -14188,7 +14197,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); - case ISD::UMUL_LOHI: return LowerUMUL_LOHI(Op, Subtarget, DAG); + case ISD::UMUL_LOHI: + case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index a598d12cc0b..7becb54d654 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -347,6 +347,8 @@ namespace llvm { // PMULUDQ - Vector multiply packed unsigned doubleword integers PMULUDQ, + // PMULUDQ - Vector multiply packed signed doubleword integers + PMULDQ, // FMA nodes FMADD, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index fc2044a173f..1582f438819 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -175,6 +175,9 @@ def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>>; +def X86pmuldq : SDNode<"X86ISD::PMULDQ", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get // translated into one of the target nodes below during lowering. diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index eb036f23eb7..7fc09855770 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7003,6 +7003,31 @@ multiclass SS48I_binop_rm opc, string OpcodeStr, SDNode OpNode, Sched<[itins.Sched.Folded, ReadAfterLd]>; } +/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst +/// types. +multiclass SS48I_binop_rm2 opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : SS48I, + Sched<[itins.Sched]>; + def rm : SS48I, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + let Predicates = [HasAVX] in { let isCommutable = 0 in defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, @@ -7031,8 +7056,9 @@ let Predicates = [HasAVX] in { defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, VEX_4V; - defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, - 0, DEFAULT_ITINS_VECIMULSCHED>, VEX_4V; + defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, + VR128, loadv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; } let Predicates = [HasAVX2] in { @@ -7064,9 +7090,9 @@ let Predicates = [HasAVX2] in { defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; - defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq", - int_x86_avx2_pmul_dq, WriteVecIMul>, - VEX_4V, VEX_L; + defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, + VR256, loadv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { @@ -7089,8 +7115,9 @@ let Constraints = "$src1 = $dst" in { memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; - defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, - 1, SSE_INTMUL_ITINS_P>; + defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1>; } let Predicates = [HasAVX] in { diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll index 0d43b96163d..06af3434b1a 100644 --- a/test/CodeGen/X86/vector-idiv.ll +++ b/test/CodeGen/X86/vector-idiv.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE +; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE ; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX define <4 x i32> @test1(<4 x i32> %a) { @@ -103,4 +103,51 @@ define <16 x i16> @test6(<16 x i16> %a) { ; AVX-NOT: vpmulhw } -; TODO: sdiv -> pmuldq +define <16 x i8> @test7(<16 x i8> %a) { + %div = sdiv <16 x i8> %a, + ret <16 x i8> %div +} + +define <4 x i32> @test8(<4 x i32> %a) { + %div = sdiv <4 x i32> %a, + ret <4 x i32> %div + +; SSE-LABEL: test8: +; SSE: pmuldq +; SSE: pshufd $57 +; SSE: pmuldq +; SSE: shufps $-35 +; SSE: pshufd $-40 +; SSE: padd +; SSE: psrld $31 +; SSE: psrad $2 +; SSE: padd + +; AVX-LABEL: test8: +; AVX: vpmuldq +; AVX: vpshufd $57 +; AVX: vpmuldq +; AVX: vshufps $-35 +; AVX: vpshufd $-40 +; AVX: vpadd +; AVX: vpsrld $31 +; AVX: vpsrad $2 +; AVX: vpadd +} + +define <8 x i32> @test9(<8 x i32> %a) { + %div = sdiv <8 x i32> %a, + ret <8 x i32> %div + +; AVX-LABEL: test9: +; AVX: vpermd +; AVX: vpmuldq +; AVX: vshufps $-35 +; AVX: vpmuldq +; AVX: vshufps $-35 +; AVX: vpshufd $-40 +; AVX: vpadd +; AVX: vpsrld $31 +; AVX: vpsrad $2 +; AVX: vpadd +}