From 04bb0e721fa93d292bf961e56ce2bb4948b5b46c Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 11 Sep 2014 15:45:27 +0000 Subject: [PATCH] Combine fmul vector FP constants when unsafe math is allowed. This is an extension of the change made with r215820: http://llvm.org/viewvc/llvm-project?view=revision&revision=215820 That patch allowed combining of splatted vector FP constants that are multiplied. This patch allows combining non-uniform vector FP constants too by relaxing the check on the type of vector. Also, canonicalize a vector fmul in the same way that we already do for scalars - if only one operand of the fmul is a constant, make it operand 1. Otherwise, we miss potential folds. This fold is also done by -instcombine, but it's possible that extra fmuls may have been generated during lowering. Differential Revision: http://reviews.llvm.org/D5254 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217599 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 28 +++++++++++--- test/CodeGen/X86/fmul-combines.ll | 48 ++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 156d0a36930..c29200a549e 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6820,8 +6820,16 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // fold vector ops if (VT.isVector()) { + // This just handles C1 * C2 for vectors. Other vector folds are below. SDValue FoldedVOp = SimplifyVBinOp(N); - if (FoldedVOp.getNode()) return FoldedVOp; + if (FoldedVOp.getNode()) + return FoldedVOp; + // Canonicalize vector constant to RHS. 
+ if (N0.getOpcode() == ISD::BUILD_VECTOR && + N1.getOpcode() != ISD::BUILD_VECTOR) + if (auto *BV0 = dyn_cast<BuildVectorSDNode>(N0)) + if (BV0->isConstant()) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); } // fold (fmul c1, c2) -> c1*c2 @@ -6842,11 +6850,19 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { return N1; // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2)) - if (N1CFP && N0.getOpcode() == ISD::FMUL && - N0.getNode()->hasOneUse() && isConstOrConstSplatFP(N0.getOperand(1))) { - SDLoc SL(N); - SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(1), N1); - return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts); + if (N0.getOpcode() == ISD::FMUL) { + // Fold scalars or any vector constants (not just splats). + // This fold is done in general by InstCombine, but extra fmul insts + // may have been generated during lowering. + SDValue N01 = N0.getOperand(1); + auto *BV1 = dyn_cast<BuildVectorSDNode>(N1); + auto *BV01 = dyn_cast<BuildVectorSDNode>(N01); + if ((N1CFP && isConstOrConstSplatFP(N01)) || + (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) { + SDLoc SL(N); + SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N01, N1); + return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts); + } } // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll index a0122356720..be041073b06 100644 --- a/test/CodeGen/X86/fmul-combines.ll +++ b/test/CodeGen/X86/fmul-combines.ll @@ -55,6 +55,54 @@ define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 { ret <4 x float> %z } +; We should be able to pre-multiply the two constant vectors. 
+; CHECK: ## float 5.000000e+00 +; CHECK: ## float 1.200000e+01 +; CHECK: ## float 2.100000e+01 +; CHECK: ## float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat: +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0> + %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0> + ret <4 x float> %z +} + +; Same as above, but reverse operands to make sure non-canonical form is also handled. +; CHECK: ## float 5.000000e+00 +; CHECK: ## float 1.200000e+01 +; CHECK: ## float 2.100000e+01 +; CHECK: ## float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical: +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 { + %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x + %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y + ret <4 x float> %z +} + +; More than one use of a constant multiply should not inhibit the optimization. +; Instead of a chain of 2 dependent mults, this test will have 2 independent mults. +; CHECK: ## float 5.000000e+00 +; CHECK: ## float 1.200000e+01 +; CHECK: ## float 2.100000e+01 +; CHECK: ## float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use: +; CHECK: mulps +; CHECK: mulps +; CHECK: addps +; CHECK: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0> + %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0> + %a = fadd <4 x float> %y, %z + ret <4 x float> %a +} + ; CHECK-LABEL: fmul_c2_c4_f32: ; CHECK-NOT: addss ; CHECK: mulss