Add target specific node for PMULUDQ. Change patterns to use it and custom lower intrinsics to it. Use it instead of intrinsic to handle 64-bit vector multiplies.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149807 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Craig Topper 2012-02-05 03:14:49 +00:00
parent 2237f84735
commit 5b209e84f4
4 changed files with 68 additions and 68 deletions

View File

@ -9426,6 +9426,10 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
}
// Arithmetic intrinsics.
case Intrinsic::x86_sse2_pmulu_dq:
case Intrinsic::x86_avx2_pmulu_dq:
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
case Intrinsic::x86_avx_hadd_ps_256:
@ -10085,78 +10089,46 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
return Lower256IntArith(Op, DAG);
assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
"Only know how to lower V2I64/V4I64 multiply");
DebugLoc dl = Op.getDebugLoc();
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);
// AloBhi = psllqi(AloBhi, 32);
// AhiBlo = psllqi(AhiBlo, 32);
// return AloBlo + AloBhi + AhiBlo;
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
if (VT == MVT::v4i64) {
assert(Subtarget->hasAVX2() && "Lowering v4i64 multiply requires AVX2");
SDValue ShAmt = DAG.getConstant(32, MVT::i32);
// ulong2 Ahi = __builtin_ia32_psrlqi256( a, 32);
// ulong2 Bhi = __builtin_ia32_psrlqi256( b, 32);
// ulong2 AloBlo = __builtin_ia32_pmuludq256( a, b );
// ulong2 AloBhi = __builtin_ia32_pmuludq256( a, Bhi );
// ulong2 AhiBlo = __builtin_ia32_pmuludq256( Ahi, b );
//
// AloBhi = __builtin_ia32_psllqi256( AloBhi, 32 );
// AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 );
// return AloBlo + AloBhi + AhiBlo;
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
DAG.getConstant(32, MVT::i32));
SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
DAG.getConstant(32, MVT::i32));
SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
A, B);
SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
A, Bhi);
SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
Ahi, B);
AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
DAG.getConstant(32, MVT::i32));
AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
DAG.getConstant(32, MVT::i32));
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
return Res;
}
// Bit cast to 32-bit vectors for MULUDQ
EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
// ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
// ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
// ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
// ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
// ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
//
// AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
// AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
// return AloBlo + AloBhi + AhiBlo;
AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
DAG.getConstant(32, MVT::i32));
SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
DAG.getConstant(32, MVT::i32));
SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
A, B);
SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
A, Bhi);
SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
Ahi, B);
AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
DAG.getConstant(32, MVT::i32));
AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
DAG.getConstant(32, MVT::i32));
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
return Res;
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
@ -11092,6 +11064,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";

View File

@ -219,7 +219,7 @@ namespace llvm {
// VZEXT_MOVL - Vector move low and zero extend.
VZEXT_MOVL,
// VZEXT_MOVL - Vector move low and sign extend.
// VSEXT_MOVL - Vector move low and sign extend.
VSEXT_MOVL,
// VSHL, VSRL - 128-bit vector logical left / right shift
@ -283,6 +283,9 @@ namespace llvm {
VPERM2X128,
VBROADCAST,
// PMULUDQ - Vector multiply packed unsigned doubleword integers
PMULUDQ,
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
// according to %al. An operator is needed so that this can be expanded
// with control flow.

View File

@ -109,6 +109,10 @@ def X86vpcomu : SDNode<"X86ISD::VPCOMU",
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisSameAs<1,2>]>>;
// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
// translated into one of the target nodes below during lowering.
// Note: this is a work in progress...

View File

@ -3530,6 +3530,26 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
[(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))]>;
}
/// PDI_binop_rm - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType DstVT, ValueType SrcVT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
bit IsCommutable = 0, bit Is2Addr = 1> {
let isCommutable = IsCommutable in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>;
def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
(bitconvert (memop_frag addr:$src2)))))]>;
}
} // ExeDomain = SSEPackedInt
// 128-bit Integer Arithmetic
@ -3553,6 +3573,8 @@ defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64,
i128mem, 0, 0>, VEX_4V;
defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64,
i128mem, 0, 0>, VEX_4V;
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
memopv2i64, i128mem, 1, 0>, VEX_4V;
// Intrinsic forms
defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b,
@ -3575,8 +3597,6 @@ defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w,
VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w,
VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
defm VPMULUDQ : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_sse2_pmulu_dq,
VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd,
VR128, memopv2i64, i128mem, 1, 0>, VEX_4V;
defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b,
@ -3614,6 +3634,8 @@ defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
i256mem, 0, 0>, VEX_4V;
defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
i256mem, 0, 0>, VEX_4V;
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
// Intrinsic forms
defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
@ -3636,8 +3658,6 @@ defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w,
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w,
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
defm VPMULUDQY : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_avx2_pmulu_dq,
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd,
VR256, memopv4i64, i256mem, 1, 0>, VEX_4V;
defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b,
@ -3675,6 +3695,8 @@ defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64,
i128mem>;
defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64,
i128mem>;
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
memopv2i64, i128mem, 1>;
// Intrinsic forms
defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b,
@ -3697,8 +3719,6 @@ defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
VR128, memopv2i64, i128mem, 1>;
defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
VR128, memopv2i64, i128mem, 1>;
defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq,
VR128, memopv2i64, i128mem, 1>;
defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
VR128, memopv2i64, i128mem, 1>;
defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b,