From 112dedc520c1aec387a6fef1c8f512a7d27f0570 Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Sat, 29 Dec 2007 06:41:28 +0000 Subject: [PATCH] avoid going through a stack slot to convert from fpstack to xmm reg if we are just going to store it back anyway. This improves things like: double foo(); void bar(double *P) { *P = foo(); } git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@45399 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/README.txt | 21 ------------ lib/Target/X86/X86ISelLowering.cpp | 47 +++++++++++++++++++++----- test/CodeGen/X86/fp-stack-ret-store.ll | 15 ++++++++ 3 files changed, 54 insertions(+), 29 deletions(-) create mode 100644 test/CodeGen/X86/fp-stack-ret-store.ll diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 67eb2ce1a54..46f31164d5d 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -1636,24 +1636,3 @@ a stride-4 IV, would would allow all the scales in the loop to go away. This would result in smaller code and more efficient microops. //===---------------------------------------------------------------------===// - -We should be smarter about conversion from fpstack to XMM regs. - -double foo(); -void bar(double *P) { *P = foo(); } - -We compile that to: - -_bar: - subl $12, %esp - call L_foo$stub - fstpl (%esp) - movl 16(%esp), %eax - movsd (%esp), %xmm0 - movsd %xmm0, (%eax) - addl $12, %esp - ret - -for example. The magic to/from the stack is unneeded. - -//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 79aaaebb01b..598536d8e35 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -33,7 +33,6 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SSARegMap.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/SmallSet.h" @@ -812,7 +811,6 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall, CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs); CCInfo.AnalyzeCallResult(TheCall, RetCC_X86); - SmallVector ResultVals; // Copy all of the result registers out of their specified physreg. @@ -838,17 +836,50 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall, // an XMM register. if ((X86ScalarSSEf32 && RVLocs[0].getValVT() == MVT::f32) || (X86ScalarSSEf64 && RVLocs[0].getValVT() == MVT::f64)) { + SDOperand StoreLoc; + const Value *SrcVal = 0; + int SrcValOffset = 0; + + // Determine where to store the value. If the call result is directly + // used by a store, see if we can store directly into the location. In + // this case, we'll end up producing a fst + movss[load] + movss[store] to + // the same location, and the two movss's will be nuked as dead. This + // optimizes common things like "*D = atof(..)" to not need an + // intermediate stack slot. + if (SDOperand(TheCall, 0).hasOneUse() && + SDOperand(TheCall, 1).hasOneUse()) { + // Ok, we have one use of the value and one use of the chain. See if + // they are the same node: a store. + if (StoreSDNode *N = dyn_cast(*TheCall->use_begin())) { + if (N->getChain().Val == TheCall && N->getValue().Val == TheCall && + !N->isVolatile() && !N->isTruncatingStore() && + N->getAddressingMode() == ISD::UNINDEXED) { + StoreLoc = N->getBasePtr(); + SrcVal = N->getSrcValue(); + SrcValOffset = N->getSrcValueOffset(); + } + } + } + + // If we weren't able to optimize the result, just create a temporary + // stack slot. + if (StoreLoc.Val == 0) { + MachineFunction &MF = DAG.getMachineFunction(); + int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); + StoreLoc = DAG.getFrameIndex(SSFI, getPointerTy()); + } + // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This // shouldn't be necessary except that RFP cannot be live across - // multiple blocks. When stackifier is fixed, they can be uncoupled. - MachineFunction &MF = DAG.getMachineFunction(); - int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); - SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + // multiple blocks (which could happen if a select gets lowered into + // multiple blocks and scheduled in between them). When stackifier is + // fixed, they can be uncoupled. SDOperand Ops[] = { - Chain, RetVal, StackSlot, DAG.getValueType(RVLocs[0].getValVT()), InFlag + Chain, RetVal, StoreLoc, DAG.getValueType(RVLocs[0].getValVT()), InFlag }; Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5); - RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain, StackSlot, NULL, 0); + RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain, + StoreLoc, SrcVal, SrcValOffset); Chain = RetVal.getValue(1); } ResultVals.push_back(RetVal); diff --git a/test/CodeGen/X86/fp-stack-ret-store.ll b/test/CodeGen/X86/fp-stack-ret-store.ll new file mode 100644 index 00000000000..87829649a31 --- /dev/null +++ b/test/CodeGen/X86/fp-stack-ret-store.ll @@ -0,0 +1,15 @@ +; RUN: llvm-as < %s | llc | not grep movss +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i686-apple-darwin8" + +; This should store directly into P from the FP stack. It should not +; go through a stack slot to get there. + +define void @bar(double* %P) { +entry: + %tmp = tail call double (...)* @foo( ) ; [#uses=1] + store double %tmp, double* %P, align 8 + ret void +} + +declare double @foo(...)