From 04bb0e721fa93d292bf961e56ce2bb4948b5b46c Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 11 Sep 2014 15:45:27 +0000
Subject: [PATCH] Combine fmul vector FP constants when unsafe math is allowed.

This is an extension of the change made with r215820:
http://llvm.org/viewvc/llvm-project?view=revision&revision=215820

That patch allowed combining of multiplied vector FP constants, but only for splats.

This patch also allows combining non-splat (non-uniform) vector FP constants by
relaxing the check on the kind of constant vector. It also canonicalizes a
vector fmul the same way we already do for scalars: if only one operand of the
fmul is a constant, it is made operand 1. Otherwise, we miss potential folds.

This fold is also done by -instcombine, but it's possible that extra
fmuls may have been generated during lowering.

Differential Revision: http://reviews.llvm.org/D5254



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217599 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 28 +++++++++++---
 test/CodeGen/X86/fmul-combines.ll        | 48 ++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 6 deletions(-)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 156d0a36930..c29200a549e 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6820,8 +6820,16 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
 
   // fold vector ops
   if (VT.isVector()) {
+    // This just handles C1 * C2 for vectors. Other vector folds are below.
     SDValue FoldedVOp = SimplifyVBinOp(N);
-    if (FoldedVOp.getNode()) return FoldedVOp;
+    if (FoldedVOp.getNode())
+      return FoldedVOp;
+    // Canonicalize vector constant to RHS.
+    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+        N1.getOpcode() != ISD::BUILD_VECTOR)
+      if (auto *BV0 = dyn_cast<BuildVectorSDNode>(N0))
+        if (BV0->isConstant())
+          return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
   }
 
   // fold (fmul c1, c2) -> c1*c2
@@ -6842,11 +6850,19 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
       return N1;
 
     // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
-    if (N1CFP && N0.getOpcode() == ISD::FMUL &&
-        N0.getNode()->hasOneUse() && isConstOrConstSplatFP(N0.getOperand(1))) {
-      SDLoc SL(N);
-      SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(1), N1);
-      return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts);
+    if (N0.getOpcode() == ISD::FMUL) {
+      // Fold scalars or any vector constants (not just splats).
+      // This fold is done in general by InstCombine, but extra fmul insts
+      // may have been generated during lowering.
+      SDValue N01 = N0.getOperand(1);
+      auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+      auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);
+      if ((N1CFP && isConstOrConstSplatFP(N01)) ||
+          (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) {
+        SDLoc SL(N);
+        SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N01, N1);
+        return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts);
+      }
     }
 
     // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c))
diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll
index a0122356720..be041073b06 100644
--- a/test/CodeGen/X86/fmul-combines.ll
+++ b/test/CodeGen/X86/fmul-combines.ll
@@ -55,6 +55,54 @@ define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 {
   ret <4 x float> %z
 }
 
+; We should be able to pre-multiply the two constant vectors.
+; CHECK: ## float 5.000000e+00
+; CHECK: ## float 1.200000e+01
+; CHECK: ## float 2.100000e+01
+; CHECK: ## float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  ret <4 x float> %z
+}
+
+; Same as above, but reverse operands to make sure non-canonical form is also handled.
+; CHECK: ## float 5.000000e+00
+; CHECK: ## float 1.200000e+01
+; CHECK: ## float 2.100000e+01
+; CHECK: ## float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 {
+  %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
+  %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y
+  ret <4 x float> %z
+}
+
+; More than one use of a constant multiply should not inhibit the optimization.
+; Instead of a chain of 2 dependent mults, this test will have 2 independent mults.
+; CHECK: ## float 5.000000e+00
+; CHECK: ## float 1.200000e+01
+; CHECK: ## float 2.100000e+01
+; CHECK: ## float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use:
+; CHECK: mulps
+; CHECK: mulps
+; CHECK: addps
+; CHECK: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %a = fadd <4 x float> %y, %z
+  ret <4 x float> %a
+}
+
 ; CHECK-LABEL: fmul_c2_c4_f32:
 ; CHECK-NOT: addss
 ; CHECK: mulss