X86: Turn mul of <4 x i32> into pmuludq when no SSE4.1 is available.

pmuludq is slow, but it turns out that all the unpacking and packing of the
scalarized mul is even slower. 10% speedup on loop-vectorized paq8p.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170985 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Benjamin Kramer 2012-12-22 16:07:56 +00:00
parent 17347912b4
commit 2f8a6cdfa3
2 changed files with 43 additions and 5 deletions

View File

@ -870,6 +870,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ADD, MVT::v8i16, Legal); setOperationAction(ISD::ADD, MVT::v8i16, Legal);
setOperationAction(ISD::ADD, MVT::v4i32, Legal); setOperationAction(ISD::ADD, MVT::v4i32, Legal);
setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::ADD, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v16i8, Legal);
setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal);
@ -11027,17 +11028,43 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) { SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType(); EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops. // Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget->hasInt256()) if (VT.is256BitVector() && !Subtarget->hasInt256())
return Lower256IntArith(Op, DAG); return Lower256IntArith(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
"Should not custom lower when pmuldq is available!");
// Extract the odd parts.
const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
// Now multiply odd parts.
SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
assert((VT == MVT::v2i64 || VT == MVT::v4i64) && assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
"Only know how to lower V2I64/V4I64 multiply"); "Only know how to lower V2I64/V4I64 multiply");
DebugLoc dl = Op.getDebugLoc();
// Ahi = psrlqi(a, 32); // Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32); // Bhi = psrlqi(b, 32);
// //
@ -11049,9 +11076,6 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// AhiBlo = psllqi(AhiBlo, 32); // AhiBlo = psllqi(AhiBlo, 32);
// return AloBlo + AloBhi + AhiBlo; // return AloBlo + AloBhi + AhiBlo;
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
SDValue ShAmt = DAG.getConstant(32, MVT::i32); SDValue ShAmt = DAG.getConstant(32, MVT::i32);
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);

View File

@ -0,0 +1,14 @@
; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) {
%m = mul <4 x i32> %x, %y
ret <4 x i32> %m
; CHECK: test1:
; CHECK: pshufd $49
; CHECK: pmuludq
; CHECK: pshufd $49
; CHECK: pmuludq
; CHECK: shufps $-120
; CHECK: pshufd $-40
; CHECK: ret
}