mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-13 04:30:23 +00:00
Teach codegen to lower llvm.powi to an efficient (but not optimal)
multiply sequence when the power is a constant integer. Previously, our codegen for std::pow(.., int) always turned into a libcall, which was really inefficient. This should also make many gfortran programs happier, I'd imagine.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@92388 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
0fba8cf9ff
commit
f031e8ad01
@ -462,7 +462,8 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc dl, unsigned Order,
|
||||
// The number of parts is a power of 2. Repeatedly bisect the value using
|
||||
// EXTRACT_ELEMENT.
|
||||
Parts[0] = DAG.getNode(ISD::BIT_CONVERT, dl,
|
||||
EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()),
|
||||
EVT::getIntegerVT(*DAG.getContext(),
|
||||
ValueVT.getSizeInBits()),
|
||||
Val);
|
||||
|
||||
if (DisableScheduling)
|
||||
@ -4261,6 +4262,59 @@ SelectionDAGBuilder::visitPow(CallInst &I) {
|
||||
setValue(&I, result);
|
||||
}
|
||||
|
||||
|
||||
/// ExpandPowI - Expand a llvm.powi intrinsic.
|
||||
static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
|
||||
SelectionDAG &DAG) {
|
||||
// If RHS is a constant, we can expand this out to a multiplication tree,
|
||||
// otherwise we end up lowering to a call to __powidf2 (for example). When
|
||||
// optimizing for size, we only want to do this if the expansion would produce
|
||||
// a small number of multiplies, otherwise we do the full expansion.
|
||||
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
|
||||
// Get the exponent as a positive value.
|
||||
unsigned Val = RHSC->getSExtValue();
|
||||
if ((int)Val < 0) Val = -Val;
|
||||
|
||||
// powi(x, 0) -> 1.0
|
||||
if (Val == 0)
|
||||
return DAG.getConstantFP(1.0, LHS.getValueType());
|
||||
|
||||
Function *F = DAG.getMachineFunction().getFunction();
|
||||
if (!F->hasFnAttr(Attribute::OptimizeForSize) ||
|
||||
// If optimizing for size, don't insert too many multiplies. This
|
||||
// inserts up to 5 multiplies.
|
||||
CountPopulation_32(Val)+Log2_32(Val) < 7) {
|
||||
// We use the simple binary decomposition method to generate the multiply
|
||||
// sequence. There are more optimal ways to do this (for example,
|
||||
// powi(x,15) generates one more multiply than it should), but this has
|
||||
// the benefit of being both really simple and much better than a libcall.
|
||||
SDValue Res; // Logically starts equal to 1.0
|
||||
SDValue CurSquare = LHS;
|
||||
while (Val) {
|
||||
if (Val & 1)
|
||||
if (Res.getNode())
|
||||
Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare);
|
||||
else
|
||||
Res = CurSquare; // 1.0*CurSquare.
|
||||
|
||||
CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
|
||||
CurSquare, CurSquare);
|
||||
Val >>= 1;
|
||||
}
|
||||
|
||||
// If the original was negative, invert the result, producing 1/(x*x*x).
|
||||
if (RHSC->getSExtValue() < 0)
|
||||
Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
|
||||
DAG.getConstantFP(1.0, LHS.getValueType()), Res);
|
||||
return Res;
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, expand to a libcall.
|
||||
return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
|
||||
}
|
||||
|
||||
|
||||
/// visitIntrinsicCall - Lower the call to the specified intrinsic function. If
|
||||
/// we want to emit this as a call to a named external function, return the name
|
||||
/// otherwise lower it and return null.
|
||||
@ -4536,10 +4590,8 @@ SelectionDAGBuilder::visitIntrinsicCall(CallInst &I, unsigned Intrinsic) {
|
||||
DAG.AssignOrdering(Res.getNode(), SDNodeOrder);
|
||||
return 0;
|
||||
case Intrinsic::powi:
|
||||
Res = DAG.getNode(ISD::FPOWI, dl,
|
||||
getValue(I.getOperand(1)).getValueType(),
|
||||
getValue(I.getOperand(1)),
|
||||
getValue(I.getOperand(2)));
|
||||
Res = ExpandPowI(dl, getValue(I.getOperand(1)), getValue(I.getOperand(2)),
|
||||
DAG);
|
||||
setValue(&I, Res);
|
||||
if (DisableScheduling)
|
||||
DAG.AssignOrdering(Res.getNode(), SDNodeOrder);
|
||||
|
@ -756,36 +756,6 @@ be done safely if "b" isn't modified between the strlen and memcpy of course.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
We generate a horrible libcall for llvm.powi. For example, we compile:
|
||||
|
||||
#include <cmath>
|
||||
double f(double a) { return std::pow(a, 4); }
|
||||
|
||||
into:
|
||||
|
||||
__Z1fd:
|
||||
subl $12, %esp
|
||||
movsd 16(%esp), %xmm0
|
||||
movsd %xmm0, (%esp)
|
||||
movl $4, 8(%esp)
|
||||
call L___powidf2$stub
|
||||
addl $12, %esp
|
||||
ret
|
||||
|
||||
GCC produces:
|
||||
|
||||
__Z1fd:
|
||||
subl $12, %esp
|
||||
movsd 16(%esp), %xmm0
|
||||
mulsd %xmm0, %xmm0
|
||||
mulsd %xmm0, %xmm0
|
||||
movsd %xmm0, (%esp)
|
||||
fldl (%esp)
|
||||
addl $12, %esp
|
||||
ret
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
We compile this program: (from GCC PR11680)
|
||||
http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
|
||||
|
||||
|
@ -1,47 +1,30 @@
|
||||
; RUN: llc < %s | grep powixf2
|
||||
; RUN: llc < %s | grep fsqrt
|
||||
; ModuleID = 'yyy.c'
|
||||
; RUN: llc < %s | FileCheck %s
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
|
||||
target triple = "i686-apple-darwin8"
|
||||
|
||||
define x86_fp80 @foo(x86_fp80 %x) {
|
||||
define x86_fp80 @foo(x86_fp80 %x) nounwind{
|
||||
entry:
|
||||
%x_addr = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
|
||||
%retval = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
|
||||
%tmp = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
|
||||
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
|
||||
store x86_fp80 %x, x86_fp80* %x_addr
|
||||
%tmp1 = load x86_fp80* %x_addr, align 16 ; <x86_fp80> [#uses=1]
|
||||
%tmp2 = call x86_fp80 @llvm.sqrt.f80( x86_fp80 %tmp1 ) ; <x86_fp80> [#uses=1]
|
||||
store x86_fp80 %tmp2, x86_fp80* %tmp, align 16
|
||||
%tmp3 = load x86_fp80* %tmp, align 16 ; <x86_fp80> [#uses=1]
|
||||
store x86_fp80 %tmp3, x86_fp80* %retval, align 16
|
||||
br label %return
|
||||
|
||||
return: ; preds = %entry
|
||||
%retval4 = load x86_fp80* %retval ; <x86_fp80> [#uses=1]
|
||||
ret x86_fp80 %retval4
|
||||
%tmp2 = call x86_fp80 @llvm.sqrt.f80( x86_fp80 %x )
|
||||
ret x86_fp80 %tmp2
|
||||
|
||||
; CHECK: foo:
|
||||
; CHECK: fldt 4(%esp)
|
||||
; CHECK-NEXT: fsqrt
|
||||
; CHECK-NEXT: ret
|
||||
}
|
||||
|
||||
declare x86_fp80 @llvm.sqrt.f80(x86_fp80)
|
||||
|
||||
define x86_fp80 @bar(x86_fp80 %x) {
|
||||
define x86_fp80 @bar(x86_fp80 %x) nounwind {
|
||||
entry:
|
||||
%x_addr = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
|
||||
%retval = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
|
||||
%tmp = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
|
||||
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
|
||||
store x86_fp80 %x, x86_fp80* %x_addr
|
||||
%tmp1 = load x86_fp80* %x_addr, align 16 ; <x86_fp80> [#uses=1]
|
||||
%tmp2 = call x86_fp80 @llvm.powi.f80( x86_fp80 %tmp1, i32 3 ) ; <x86_fp80> [#uses=1]
|
||||
store x86_fp80 %tmp2, x86_fp80* %tmp, align 16
|
||||
%tmp3 = load x86_fp80* %tmp, align 16 ; <x86_fp80> [#uses=1]
|
||||
store x86_fp80 %tmp3, x86_fp80* %retval, align 16
|
||||
br label %return
|
||||
|
||||
return: ; preds = %entry
|
||||
%retval4 = load x86_fp80* %retval ; <x86_fp80> [#uses=1]
|
||||
ret x86_fp80 %retval4
|
||||
%tmp2 = call x86_fp80 @llvm.powi.f80( x86_fp80 %x, i32 3 )
|
||||
ret x86_fp80 %tmp2
|
||||
; CHECK: bar:
|
||||
; CHECK: fldt 4(%esp)
|
||||
; CHECK-NEXT: fld %st(0)
|
||||
; CHECK-NEXT: fmul %st(1)
|
||||
; CHECK-NEXT: fmulp %st(1)
|
||||
; CHECK-NEXT: ret
|
||||
}
|
||||
|
||||
declare x86_fp80 @llvm.powi.f80(x86_fp80, i32)
|
||||
|
11
test/CodeGen/X86/powi.ll
Normal file
11
test/CodeGen/X86/powi.ll
Normal file
@ -0,0 +1,11 @@
|
||||
; RUN: llc %s -march=x86 -mcpu=yonah -o - | grep mulsd | count 6
|
||||
; Ideally this would compile to 5 multiplies.
|
||||
|
||||
define double @_Z3f10d(double %a) nounwind readonly ssp noredzone {
|
||||
entry:
|
||||
%0 = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
|
||||
ret double %0
|
||||
}
|
||||
|
||||
declare double @llvm.powi.f64(double, i32) nounwind readonly
|
||||
|
Loading…
Reference in New Issue
Block a user