Make musttail more robust for vector types on x86

Previously I tried to plug musttail into the existing vararg lowering
code. That turned out to be a mistake, because non-vararg calls use
significantly different register lowering, even on x86. For example, AVX
vectors are usually passed in registers to normal functions and in memory
to vararg functions. Now musttail uses a completely separate lowering.

Hopefully this can be used as the basis for non-x86 perfect forwarding.

Reviewers: majnemer

Differential Revision: http://reviews.llvm.org/D6156

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224745 91177308-0d34-0410-b5e6-96231b3b80d8
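In outline, the forwarding has two halves. Here is a condensed sketch (illustrative only, stitched together from the X86 changes below; CCInfo, FuncInfo, MF, DAG, Chain and dl are as in LowerFormalArguments):

    // Prologue side: ask the calling convention which parameter registers a
    // non-variadic call would still have free, and record them as forwards.
    SmallVector<MVT, 2> RegParmTypes;
    RegParmTypes.push_back(MVT::i64);   // integer regparms
    RegParmTypes.push_back(MVT::v8f32); // largest legal vector type, AVX here
    SmallVectorImpl<ForwardedRegister> &Forwards =
        FuncInfo->getForwardedMustTailRegParms();
    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

    // Copy each live-in physical register into a virtual register so the
    // value survives whatever runs between the prologue and the musttail
    // call; the call site later reloads the values back into the original
    // physical registers.
    for (ForwardedRegister &F : Forwards) {
      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
    }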
include/llvm/CodeGen/CallingConvLower.h:

@@ -158,6 +158,16 @@ public:
   }
 };
 
+/// Describes a register that needs to be forwarded from the prologue to a
+/// musttail call.
+struct ForwardedRegister {
+  ForwardedRegister(unsigned VReg, MCPhysReg PReg, MVT VT)
+      : VReg(VReg), PReg(PReg), VT(VT) {}
+  unsigned VReg;
+  MCPhysReg PReg;
+  MVT VT;
+};
+
 /// CCAssignFn - This function assigns a location for Val, updating State to
 /// reflect the change.  It returns 'true' if it failed to handle Val.
 typedef bool CCAssignFn(unsigned ValNo, MVT ValVT,

@@ -470,6 +480,19 @@ public:
     return PendingLocs;
   }
 
+  /// Compute the remaining unused register parameters that would be used for
+  /// the given value type. This is useful when varargs are passed in the
+  /// registers that normal prototyped parameters would be passed in, or for
+  /// implementing perfect forwarding.
+  void getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, MVT VT,
+                                   CCAssignFn Fn);
+
+  /// Compute the set of registers that need to be preserved and forwarded to
+  /// any musttail calls.
+  void analyzeMustTailForwardedRegisters(
+      SmallVectorImpl<ForwardedRegister> &Forwards, ArrayRef<MVT> RegParmTypes,
+      CCAssignFn Fn);
+
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(unsigned Reg);
lib/CodeGen/CallingConvLower.cpp:

@@ -14,9 +14,11 @@
 
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"

@@ -178,3 +180,57 @@ void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) {
     llvm_unreachable(nullptr);
   }
 }
+
+void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
+                                          MVT VT, CCAssignFn Fn) {
+  unsigned SavedStackOffset = StackOffset;
+  unsigned NumLocs = Locs.size();
+
+  // Allocate something of this value type repeatedly with just the inreg flag
+  // set until we get assigned a location in memory.
+  ISD::ArgFlagsTy Flags;
+  Flags.setInReg();
+  bool HaveRegParm = true;
+  while (HaveRegParm) {
+    if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call has unhandled type " << EVT(VT).getEVTString()
+             << " while computing remaining regparms\n";
+#endif
+      llvm_unreachable(nullptr);
+    }
+    HaveRegParm = Locs.back().isRegLoc();
+  }
+
+  // Copy all the registers from the value locations we added.
+  assert(NumLocs < Locs.size() && "CC assignment failed to add location");
+  for (unsigned I = NumLocs, E = Locs.size(); I != E; ++I)
+    if (Locs[I].isRegLoc())
+      Regs.push_back(MCPhysReg(Locs[I].getLocReg()));
+
+  // Clear the assigned values and stack memory. We leave the registers marked
+  // as allocated so that future queries don't return the same registers, i.e.
+  // when i64 and f64 are both passed in GPRs.
+  StackOffset = SavedStackOffset;
+  Locs.resize(NumLocs);
+}
+
+void CCState::analyzeMustTailForwardedRegisters(
+    SmallVectorImpl<ForwardedRegister> &Forwards, ArrayRef<MVT> RegParmTypes,
+    CCAssignFn Fn) {
+  // Oftentimes calling conventions will not use register parameters for
+  // variadic functions, so we need to assume we're not variadic so that we get
+  // all the registers that might be used in a non-variadic call.
+  SaveAndRestore<bool> SavedVarArg(IsVarArg, false);
+
+  for (MVT RegVT : RegParmTypes) {
+    SmallVector<MCPhysReg, 8> RemainingRegs;
+    getRemainingRegParmsForType(RemainingRegs, RegVT, Fn);
+    const TargetLowering *TL = MF.getSubtarget().getTargetLowering();
+    const TargetRegisterClass *RC = TL->getRegClassFor(RegVT);
+    for (MCPhysReg PReg : RemainingRegs) {
+      unsigned VReg = MF.addLiveIn(PReg, RC);
+      Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT));
+    }
+  }
+}
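As a concrete illustration (a hypothetical call, not part of the commit): on x86-64, where integer arguments go in RDI, RSI, RDX, RCX, R8, R9, a CCState that has already assigned the first two integer arguments would report the rest:

    SmallVector<MCPhysReg, 8> Regs;
    CCInfo.getRemainingRegParmsForType(Regs, MVT::i64, CC_X86);
    // Regs now holds the still-unallocated GPRs: RDX, RCX, R8, R9.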
lib/Target/X86/X86ISelLowering.cpp:

@@ -2549,11 +2549,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
                            MFI->CreateFixedObject(1, StackSize, true));
   }
 
+  // Figure out if XMM registers are in use.
+  bool HaveXMMArgs = Is64Bit && !IsWin64;
+  bool NoImplicitFloatOps = Fn->getAttributes().hasAttribute(
+      AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
+         "SSE register cannot be used when SSE is disabled!");
+  if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+      !Subtarget->hasSSE1())
+    HaveXMMArgs = false;
+
   // 64-bit calling conventions support varargs and register parameters, so we
-  // have to do extra work to spill them in the prologue or forward them to
-  // musttail calls.
-  if (Is64Bit && isVarArg &&
-      (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
+  // have to do extra work to spill them in the prologue.
+  if (Is64Bit && isVarArg && MFI->hasVAStart()) {
     // Find the first unallocated argument registers.
     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);

@@ -2583,8 +2591,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     }
   }
 
-    // Store them to the va_list returned by va_start.
-    if (MFI->hasVAStart()) {
     if (IsWin64) {
       const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
       // Get to the caller-allocated home save location.  Add 8 to account
 
@@ -2639,34 +2645,45 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
 
     if (!MemOps.empty())
       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
-    } else {
-      // Add all GPRs, al, and XMMs to the list of forwards.  We will add them
-      // to the liveout set on a musttail call.
-      assert(MFI->hasMustTailInVarArgFunc());
-      auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
-      typedef X86MachineFunctionInfo::Forward Forward;
-
-      for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
-        unsigned VReg =
-            MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
-        Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
-        Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
-      }
-
-      if (!ArgXMMs.empty()) {
-        unsigned ALVReg =
-            MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
-        Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
-        Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
-
-        for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
-          unsigned VReg =
-              MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
-          Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
-          Forwards.push_back(
-              Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
-        }
-      }
-    }
+  }
+
+  if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
+    // Find the largest legal vector type.
+    MVT VecVT = MVT::Other;
+    // FIXME: Only some x86_32 calling conventions support AVX512.
+    if (Subtarget->hasAVX512() &&
+        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
+                     CallConv == CallingConv::Intel_OCL_BI)))
+      VecVT = MVT::v16f32;
+    else if (Subtarget->hasAVX())
+      VecVT = MVT::v8f32;
+    else if (Subtarget->hasSSE2())
+      VecVT = MVT::v4f32;
+
+    // We forward some GPRs and some vector types.
+    SmallVector<MVT, 2> RegParmTypes;
+    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
+    RegParmTypes.push_back(IntVT);
+    if (VecVT != MVT::Other)
+      RegParmTypes.push_back(VecVT);
+
+    // Compute the set of forwarded registers. The rest are scratch.
+    SmallVectorImpl<ForwardedRegister> &Forwards =
+        FuncInfo->getForwardedMustTailRegParms();
+    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+    // Conservatively forward AL on x86_64, since it might be used for varargs.
+    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+    }
+
+    // Copy all forwards from physical to virtual registers.
+    for (ForwardedRegister &F : Forwards) {
+      // FIXME: Can we use a less constrained schedule?
+      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
+    }
   }
 

@@ -2986,7 +3003,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   }
 
-  if (Is64Bit && isVarArg && IsMustTail) {
+  if (isVarArg && IsMustTail) {
     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
     for (const auto &F : Forwards) {
       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
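The hunk ends at the reload. Presumably (an assumption; the line falls outside the excerpt) each reloaded value is then re-associated with its original physical register in the outgoing-argument list, along these lines:

      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));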
lib/Target/X86/X86MachineFunctionInfo.h:

@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include <vector>

@@ -77,21 +78,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// NumLocalDynamics - Number of local-dynamic TLS accesses.
   unsigned NumLocalDynamics;
 
-public:
-  /// Describes a register that needs to be forwarded from the prologue to a
-  /// musttail call.
-  struct Forward {
-    Forward(unsigned VReg, MCPhysReg PReg, MVT VT)
-        : VReg(VReg), PReg(PReg), VT(VT) {}
-    unsigned VReg;
-    MCPhysReg PReg;
-    MVT VT;
-  };
-
 private:
   /// ForwardedMustTailRegParms - A list of virtual and physical registers
   /// that must be forwarded to every musttail call.
-  std::vector<Forward> ForwardedMustTailRegParms;
+  SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 
 public:
   X86MachineFunctionInfo() : ForceFramePointer(false),

@@ -168,7 +158,7 @@ public:
   unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
 
-  std::vector<Forward> &getForwardedMustTailRegParms() {
+  SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
     return ForwardedMustTailRegParms;
   }
 };
test/CodeGen/X86/musttail-fastcall.ll (new file, 109 lines):

; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2,+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2,+avx,+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512

; While we don't support varargs with fastcall, we do support forwarding.

@asdf = internal constant [4 x i8] c"asdf"

declare void @puts(i8*)

define i32 @call_fast_thunk() {
  %r = call x86_fastcallcc i32 (...)* @fast_thunk(i32 inreg 1, i32 inreg 2, i32 3)
  ret i32 %r
}

define x86_fastcallcc i32 @fast_thunk(...) {
  call void @puts(i8* getelementptr ([4 x i8]* @asdf, i32 0, i32 0))
  %r = musttail call x86_fastcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @fast_target to i32 (...)*) (...)
  ret i32 %r
}

; Check that we spill and fill around the call to puts.

; CHECK-LABEL: @fast_thunk@0:
; CHECK-DAG: movl %ecx, {{.*}}
; CHECK-DAG: movl %edx, {{.*}}
; CHECK: calll _puts
; CHECK-DAG: movl {{.*}}, %ecx
; CHECK-DAG: movl {{.*}}, %edx
; CHECK: jmp @fast_target@12

define x86_fastcallcc i32 @fast_target(i32 inreg %a, i32 inreg %b, i32 %c) {
  %a0 = add i32 %a, %b
  %a1 = add i32 %a0, %c
  ret i32 %a1
}

; Repeat the test for vectorcall, which has XMM registers.

define i32 @call_vector_thunk() {
  %r = call x86_vectorcallcc i32 (...)* @vector_thunk(i32 inreg 1, i32 inreg 2, i32 3)
  ret i32 %r
}

define x86_vectorcallcc i32 @vector_thunk(...) {
  call void @puts(i8* getelementptr ([4 x i8]* @asdf, i32 0, i32 0))
  %r = musttail call x86_vectorcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @vector_target to i32 (...)*) (...)
  ret i32 %r
}

; Check that we spill and fill SSE registers around the call to puts.

; CHECK-LABEL: vector_thunk@@0:
; CHECK-DAG: movl %ecx, {{.*}}
; CHECK-DAG: movl %edx, {{.*}}

; SSE2-DAG: movups %xmm0, {{.*}}
; SSE2-DAG: movups %xmm1, {{.*}}
; SSE2-DAG: movups %xmm2, {{.*}}
; SSE2-DAG: movups %xmm3, {{.*}}
; SSE2-DAG: movups %xmm4, {{.*}}
; SSE2-DAG: movups %xmm5, {{.*}}

; AVX-DAG: vmovups %ymm0, {{.*}}
; AVX-DAG: vmovups %ymm1, {{.*}}
; AVX-DAG: vmovups %ymm2, {{.*}}
; AVX-DAG: vmovups %ymm3, {{.*}}
; AVX-DAG: vmovups %ymm4, {{.*}}
; AVX-DAG: vmovups %ymm5, {{.*}}

; AVX512-DAG: vmovups %zmm0, {{.*}}
; AVX512-DAG: vmovups %zmm1, {{.*}}
; AVX512-DAG: vmovups %zmm2, {{.*}}
; AVX512-DAG: vmovups %zmm3, {{.*}}
; AVX512-DAG: vmovups %zmm4, {{.*}}
; AVX512-DAG: vmovups %zmm5, {{.*}}

; CHECK: calll _puts

; SSE2-DAG: movups {{.*}}, %xmm0
; SSE2-DAG: movups {{.*}}, %xmm1
; SSE2-DAG: movups {{.*}}, %xmm2
; SSE2-DAG: movups {{.*}}, %xmm3
; SSE2-DAG: movups {{.*}}, %xmm4
; SSE2-DAG: movups {{.*}}, %xmm5

; AVX-DAG: vmovups {{.*}}, %ymm0
; AVX-DAG: vmovups {{.*}}, %ymm1
; AVX-DAG: vmovups {{.*}}, %ymm2
; AVX-DAG: vmovups {{.*}}, %ymm3
; AVX-DAG: vmovups {{.*}}, %ymm4
; AVX-DAG: vmovups {{.*}}, %ymm5

; AVX512-DAG: vmovups {{.*}}, %zmm0
; AVX512-DAG: vmovups {{.*}}, %zmm1
; AVX512-DAG: vmovups {{.*}}, %zmm2
; AVX512-DAG: vmovups {{.*}}, %zmm3
; AVX512-DAG: vmovups {{.*}}, %zmm4
; AVX512-DAG: vmovups {{.*}}, %zmm5

; CHECK-DAG: movl {{.*}}, %ecx
; CHECK-DAG: movl {{.*}}, %edx
; CHECK: jmp vector_target@@12

define x86_vectorcallcc i32 @vector_target(i32 inreg %a, i32 inreg %b, i32 %c) {
  %a0 = add i32 %a, %b
  %a1 = add i32 %a0, %c
  ret i32 %a1
}
test/CodeGen/X86/musttail-varargs.ll:

@@ -5,9 +5,16 @@
 ; pack. Doing a normal call will clobber all argument registers, and we will
 ; spill around it. A simple adjustment should not require any XMM spills.
 
+declare void @llvm.va_start(i8*) nounwind
+
 declare void(i8*, ...)* @get_f(i8* %this)
 
 define void @f_thunk(i8* %this, ...) {
+  ; Use va_start so that we exercise the combination.
+  %ap = alloca [4 x i8*], align 16
+  %ap_i8 = bitcast [4 x i8*]* %ap to i8*
+  call void @llvm.va_start(i8* %ap_i8)
+
   %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
   musttail call void (i8*, ...)* %fptr(i8* %this, ...)
   ret void