From 689edc8b2845f7a6cf9403722de2000598f68489 Mon Sep 17 00:00:00 2001
From: Chad Rosier
Date: Thu, 16 Jun 2011 01:21:54 +0000
Subject: [PATCH] Revision r128665 added an optimization to make use of NEON
 multiplier accumulator forwarding. Specifically (from SVN log entry):

Distribute (A + B) * C to (A * C) + (B * C) to make use of NEON multiplier
accumulator forwarding:
vadd d3, d0, d1
vmul d3, d3, d2
=>
vmul d3, d0, d2
vmla d3, d1, d2

Make sure it catches cases where operand 1 is add/fadd/sub/fsub, which was
intended in the original revision.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@133127 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp | 2 +-
 test/CodeGen/ARM/vmul.ll | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index faa86139424..8b7383de124 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -5687,7 +5687,7 @@ static SDValue PerformVMULCombine(SDNode *N,
   unsigned Opcode = N0.getOpcode();
   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
-    Opcode = N0.getOpcode();
+    Opcode = N1.getOpcode();
     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
         Opcode != ISD::FADD && Opcode != ISD::FSUB)
       return SDValue();
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 1fd6581ae08..d62b5d415f8 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -492,3 +492,25 @@ entry:
   store <8 x i8> %10, <8 x i8>* %11, align 8
   ret void
 }
+
+define void @distribute2_commutative(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK: distribute2_commutative
+; CHECK-NOT: vadd.i8
+; CHECK: vmul.i8
+; CHECK: vmla.i8
+  %0 = trunc i32 %mul to i8
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  %5 = extractelement <2 x double> %4, i32 1
+  %6 = bitcast double %5 to <8 x i8>
+  %7 = extractelement <2 x double> %4, i32 0
+  %8 = bitcast double %7 to <8 x i8>
+  %9 = add <8 x i8> %6, %8
+  %10 = mul <8 x i8> %2, %9
+  %11 = getelementptr inbounds %struct.uint8x8_t* %dst, i32 0, i32 0
+  store <8 x i8> %10, <8 x i8>* %11, align 8
+  ret void
+}