diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index d48164d6216..222760a0cb9 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -31,13 +31,18 @@ def RetCC_PPC : CallingConv<[
   CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
   CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
   CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+
+  // Floating point types returned as "direct" go into F1 .. F8; note that
+  // only the ELFv2 ABI fully utilizes all these registers.
+  CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
-  CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
-  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
-
-  // Vector types are always returned in V2.
-  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>,
-  CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>>
+  // Vector types returned as "direct" go into V2 .. V9; note that only the
+  // ELFv2 ABI fully utilizes all these registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
+  CCIfType<[v2f64, v2i64],
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
 ]>;
 
@@ -69,10 +74,12 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
   CCIfType<[i32], CCPromoteToType<i64>>,
   CCIfType<[i64], CCAssignToReg<[X3, X4]>>,
   CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
-  CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
-  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
-  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>,
-  CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>>
+  CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
+  CCIfType<[v2f64, v2i64],
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
 ]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index fb40d5e204e..d699e0fdd82 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2158,14 +2158,19 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
   unsigned ArgSize = ArgVT.getStoreSize();
   if (Flags.isByVal())
     ArgSize = Flags.getByValSize();
-  ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+  // Round up to multiples of the pointer size, except for array members,
+  // which are always packed.
+  if (!Flags.isInConsecutiveRegs())
+    ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
 
   return ArgSize;
 }
 
 /// CalculateStackSlotAlignment - Calculates the alignment of this argument
 /// on the stack.
-static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags,
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+                                            ISD::ArgFlagsTy Flags,
                                             unsigned PtrByteSize) {
   unsigned Align = PtrByteSize;
 
@@ -2187,6 +2192,17 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags,
     }
   }
 
+  // Array members are always packed to their original alignment.
+  if (Flags.isInConsecutiveRegs()) {
+    // If the array member was split into multiple registers, the first
+    // needs to be aligned to the size of the full type. (Except for
+    // ppcf128, which is only aligned as its f64 components.)
+    if (Flags.isSplit() && OrigVT != MVT::ppcf128)
+      Align = OrigVT.getStoreSize();
+    else
+      Align = ArgVT.getStoreSize();
+  }
+
   return Align;
 }
 
@@ -2194,7 +2210,8 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags,
 /// stack slot (instead of being passed in registers). ArgOffset,
 /// AvailableFPRs, and AvailableVRs must hold the current argument
 /// position, and will be updated to account for this argument.
-static bool CalculateStackSlotUsed(EVT ArgVT, ISD::ArgFlagsTy Flags,
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
+                                   ISD::ArgFlagsTy Flags,
                                    unsigned PtrByteSize,
                                    unsigned LinkageSize,
                                    unsigned ParamAreaSize,
@@ -2204,7 +2221,8 @@ static bool CalculateStackSlotUsed(EVT ArgVT, ISD::ArgFlagsTy Flags,
   bool UseMemory = false;
 
   // Respect alignment of argument on the stack.
-  unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize);
+  unsigned Align =
+    CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
   ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
   // If there's no space left in the argument save area, we must
   // use memory (this check also catches zero-sized arguments).
@@ -2213,6 +2231,8 @@ static bool CalculateStackSlotUsed(EVT ArgVT, ISD::ArgFlagsTy Flags,
 
   // Allocate argument on the stack.
   ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  if (Flags.isInConsecutiveRegsLast())
+    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
   // If we overran the argument save area, we must use memory
   // (this check catches arguments passed partially in memory)
   if (ArgOffset > LinkageSize + ParamAreaSize)
@@ -2563,7 +2583,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   unsigned AvailableFPRs = Num_FPR_Regs;
   unsigned AvailableVRs = Num_VR_Regs;
   for (unsigned i = 0, e = Ins.size(); i != e; ++i)
-    if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].Flags,
+    if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
                                PtrByteSize, LinkageSize, ParamAreaSize,
                                NumBytes, AvailableFPRs, AvailableVRs))
       HasParameterArea = true;
@@ -2581,6 +2601,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     SDValue ArgVal;
     bool needsLoad = false;
     EVT ObjectVT = Ins[ArgNo].VT;
+    EVT OrigVT = Ins[ArgNo].ArgVT;
     unsigned ObjSize = ObjectVT.getStoreSize();
     unsigned ArgSize = ObjSize;
     ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
@@ -2589,7 +2610,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
 
     /* Respect alignment of argument on the stack. */
     unsigned Align =
-      CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize);
+      CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
     ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
     unsigned CurArgOffset = ArgOffset;
 
@@ -2701,6 +2722,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     case MVT::i1:
     case MVT::i32:
     case MVT::i64:
+      // These can be scalar arguments or elements of an integer array type
+      // passed directly. Clang may use those instead of "byval" aggregate
+      // types to avoid forcing arguments to memory unnecessarily.
       if (GPR_idx != Num_GPR_Regs) {
         unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
@@ -2718,6 +2742,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
 
     case MVT::f32:
     case MVT::f64:
+      // These can be scalar arguments or elements of a float array type
+      // passed directly. The latter are used to implement ELFv2 homogeneous
+      // float aggregates.
       if (FPR_idx != Num_FPR_Regs) {
         unsigned VReg;
 
@@ -2730,12 +2757,32 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
 
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
         ++FPR_idx;
+      } else if (GPR_idx != Num_GPR_Regs) {
+        // This can only ever happen in the presence of f32 array types,
+        // since otherwise we never run out of FPRs before running out
+        // of GPRs.
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::f32) {
+          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
+            ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, MVT::i32));
+          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+        }
+
+        ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
       } else {
         needsLoad = true;
-        ArgSize = PtrByteSize;
       }
 
-      ArgOffset += 8;
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array. Otherwise, each float takes 8 bytes.
+      ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
+      ArgOffset += ArgSize;
+      if (Flags.isInConsecutiveRegsLast())
+        ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
       break;
     case MVT::v4f32:
     case MVT::v4i32:
@@ -2743,6 +2790,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     case MVT::v16i8:
     case MVT::v2f64:
     case MVT::v2i64:
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly. The latter are used to implement ELFv2 homogeneous
+      // vector aggregates.
       if (VR_idx != Num_VR_Regs) {
         unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
                         MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
@@ -4105,12 +4155,16 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   for (unsigned i = 0; i != NumOps; ++i) {
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
     EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
 
     /* Respect alignment of argument on the stack. */
-    unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize);
+    unsigned Align =
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
     NumBytes = ((NumBytes + Align - 1) / Align) * Align;
 
     NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+    if (Flags.isInConsecutiveRegsLast())
+      NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
   }
 
   unsigned NumBytesActuallyUsed = NumBytes;
@@ -4187,10 +4241,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   for (unsigned i = 0; i != NumOps; ++i) {
     SDValue Arg = OutVals[i];
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
 
     /* Respect alignment of argument on the stack. */
     unsigned Align =
-      CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize);
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
     ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
 
     /* Compute GPR index associated with argument offset. */
@@ -4330,6 +4386,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     case MVT::i1:
     case MVT::i32:
     case MVT::i64:
+      // These can be scalar arguments or elements of an integer array type
+      // passed directly. Clang may use those instead of "byval" aggregate
+      // types to avoid forcing arguments to memory unnecessarily.
       if (GPR_idx != NumGPRs) {
         RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
       } else {
@@ -4340,39 +4399,70 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       ArgOffset += PtrByteSize;
       break;
     case MVT::f32:
-    case MVT::f64:
-      if (FPR_idx != NumFPRs) {
+    case MVT::f64: {
+      // These can be scalar arguments or elements of a float array type
+      // passed directly. The latter are used to implement ELFv2 homogeneous
+      // float aggregates.
+
+      // Named arguments go into FPRs first, and once they overflow, the
+      // remaining arguments go into GPRs and then the parameter save area.
+      // Unnamed arguments for vararg functions always go to GPRs and
+      // then the parameter save area. For now, put all arguments to vararg
+      // routines always in both locations (FPR *and* GPR or stack slot).
+      bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+
+      // First load the argument into the next available FPR.
+      if (FPR_idx != NumFPRs)
         RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
 
-        if (isVarArg) {
-          // A single float or an aggregate containing only a single float
-          // must be passed right-justified in the stack doubleword, and
-          // in the GPR, if one is available.
-          SDValue StoreOff;
-          if (Arg.getSimpleValueType().SimpleTy == MVT::f32 &&
-              !isLittleEndian) {
-            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
-            StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
-          } else
-            StoreOff = PtrOff;
+      // Next, load the argument into GPR or stack slot if needed.
+      if (!NeedGPROrStack)
+        ;
+      else if (GPR_idx != NumGPRs) {
+        // In the non-vararg case, this can only ever happen in the
+        // presence of f32 array types, since otherwise we never run
+        // out of FPRs before running out of GPRs.
+        SDValue ArgVal;
 
-          SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
-                                       MachinePointerInfo(), false, false, 0);
-          MemOpChains.push_back(Store);
+        // Double values are always passed in a single GPR.
+        if (Arg.getValueType() != MVT::f32) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
 
-          // Float varargs are always shadowed in available integer registers
-          if (GPR_idx != NumGPRs) {
-            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
-                                       MachinePointerInfo(), false, false,
-                                       false, 0);
-            MemOpChains.push_back(Load.getValue(1));
-            RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
-          }
-        }
+        // Non-array float values are extended and passed in a GPR.
+        } else if (!Flags.isInConsecutiveRegs()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+        // If we have an array of floats, we collect every odd element
+        // together with its predecessor into one GPR.
+        } else if (ArgOffset % PtrByteSize != 0) {
+          SDValue Lo, Hi;
+          Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+          Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          if (!isLittleEndian)
+            std::swap(Lo, Hi);
+          ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+        // The final element, if even, goes into the first half of a GPR.
+        } else if (Flags.isInConsecutiveRegsLast()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+          if (!isLittleEndian)
+            ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, MVT::i32));
+
+        // Non-final even elements are skipped; they will be handled
+        // together with the subsequent argument on the next go-around.
+        } else
+          ArgVal = SDValue();
+
+        if (ArgVal.getNode())
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal));
       } else {
         // Single-precision floating-point values are mapped to the
         // second (rightmost) word of the stack doubleword.
-        if (Arg.getValueType() == MVT::f32 && !isLittleEndian) {
+        if (Arg.getValueType() == MVT::f32 &&
+            !isLittleEndian && !Flags.isInConsecutiveRegs()) {
           SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
           PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
         }
@@ -4381,14 +4471,25 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                          true, isTailCall, false, MemOpChains,
                          TailCallArguments, dl);
       }
-      ArgOffset += 8;
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array. Otherwise, each float takes 8 bytes.
+      ArgOffset += (Arg.getValueType() == MVT::f32 &&
+                    Flags.isInConsecutiveRegs()) ? 4 : 8;
+      if (Flags.isInConsecutiveRegsLast())
+        ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
       break;
+    }
     case MVT::v4f32:
     case MVT::v4i32:
     case MVT::v8i16:
     case MVT::v16i8:
     case MVT::v2f64:
     case MVT::v2i64:
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly. The latter are used to implement ELFv2 homogeneous
+      // vector aggregates.
+
       // For a varargs call, named arguments go into VRs or on the stack as
       // usual; unnamed arguments always go to the stack or the corresponding
       // GPRs when within range. For now, we always put the value in both
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index df05aa51ffa..ae8c300a4ff 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -510,6 +510,20 @@ namespace llvm {
     FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
                              const TargetLibraryInfo *LibInfo) const override;
 
+    /// \brief Returns true if an argument of type Ty needs to be passed in a
+    /// contiguous block of registers in calling convention CallConv.
+    bool functionArgumentNeedsConsecutiveRegisters(
+      Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
+      // We support any array type as "consecutive" block in the parameter
+      // save area. The element type defines the alignment requirement and
+      // whether the argument should go in GPRs, FPRs, or VRs if available.
+      //
+      // Note that clang uses this capability both to implement the ELFv2
+      // homogeneous float/vector aggregate ABI, and to avoid having to use
+      // "byval" when passing aggregates that might fully fit in registers.
+      return Ty->isArrayTy();
+    }
+
   private:
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
new file mode 100644
index 00000000000..9eed623baca
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -0,0 +1,329 @@
+; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+;
+; Verify use of registers for float/vector aggregate return.
+; + +define [8 x float] @return_float([8 x float] %x) { +entry: + ret [8 x float] %x +} +; CHECK-LABEL: @return_float +; CHECK: %entry +; CHECK-NEXT: blr + +define [8 x double] @return_double([8 x double] %x) { +entry: + ret [8 x double] %x +} +; CHECK-LABEL: @return_double +; CHECK: %entry +; CHECK-NEXT: blr + +define [4 x ppc_fp128] @return_ppcf128([4 x ppc_fp128] %x) { +entry: + ret [4 x ppc_fp128] %x +} +; CHECK-LABEL: @return_ppcf128 +; CHECK: %entry +; CHECK-NEXT: blr + +define [8 x <4 x i32>] @return_v4i32([8 x <4 x i32>] %x) { +entry: + ret [8 x <4 x i32>] %x +} +; CHECK-LABEL: @return_v4i32 +; CHECK: %entry +; CHECK-NEXT: blr + + +; +; Verify amount of space taken up by aggregates in the parameter save area. +; + +define i64 @callee_float([7 x float] %a, [7 x float] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_float +; CHECK: ld 3, 96(1) +; CHECK: blr + +define void @caller_float(i64 %x, [7 x float] %y) { +entry: + tail call void @test_float([7 x float] %y, [7 x float] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_float +; CHECK: std 3, 96(1) +; CHECK: bl test_float + +declare void @test_float([7 x float], [7 x float], i64) + +define i64 @callee_double(i64 %a, [7 x double] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_double +; CHECK: ld 3, 96(1) +; CHECK: blr + +define void @caller_double(i64 %x, [7 x double] %y) { +entry: + tail call void @test_double(i64 %x, [7 x double] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_double +; CHECK: std 3, 96(1) +; CHECK: bl test_double + +declare void @test_double(i64, [7 x double], i64) + +define i64 @callee_ppcf128(i64 %a, [4 x ppc_fp128] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_ppcf128 +; CHECK: ld 3, 104(1) +; CHECK: blr + +define void @caller_ppcf128(i64 %x, [4 x ppc_fp128] %y) { +entry: + tail call void @test_ppcf128(i64 %x, [4 x ppc_fp128] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_ppcf128 +; CHECK: std 3, 104(1) +; CHECK: bl test_ppcf128 + +declare void @test_ppcf128(i64, [4 x ppc_fp128], i64) + +define i64 @callee_i64(i64 %a, [7 x i64] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_i64 +; CHECK: ld 3, 96(1) +; CHECK: blr + +define void @caller_i64(i64 %x, [7 x i64] %y) { +entry: + tail call void @test_i64(i64 %x, [7 x i64] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_i64 +; CHECK: std 3, 96(1) +; CHECK: bl test_i64 + +declare void @test_i64(i64, [7 x i64], i64) + +define i64 @callee_i128(i64 %a, [4 x i128] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_i128 +; CHECK: ld 3, 112(1) +; CHECK: blr + +define void @caller_i128(i64 %x, [4 x i128] %y) { +entry: + tail call void @test_i128(i64 %x, [4 x i128] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_i128 +; CHECK: std 3, 112(1) +; CHECK: bl test_i128 + +declare void @test_i128(i64, [4 x i128], i64) + +define i64 @callee_v4i32(i64 %a, [4 x <4 x i32>] %b, i64 %c) { +entry: + ret i64 %c +} +; CHECK-LABEL: @callee_v4i32 +; CHECK: ld 3, 112(1) +; CHECK: blr + +define void @caller_v4i32(i64 %x, [4 x <4 x i32>] %y) { +entry: + tail call void @test_v4i32(i64 %x, [4 x <4 x i32>] %y, i64 %x) + ret void +} +; CHECK-LABEL: @caller_v4i32 +; CHECK: std 3, 112(1) +; CHECK: bl test_v4i32 + +declare void @test_v4i32(i64, [4 x <4 x i32>], i64) + + +; +; Verify handling of floating point arguments in GPRs +; + +%struct.float8 = type { [8 x float] } +%struct.float5 = type { [5 x float] } +%struct.float2 = type { [2 x float] } + +@g8 = common global %struct.float8 zeroinitializer, align 4 +@g5 = common 
global %struct.float5 zeroinitializer, align 4 +@g2 = common global %struct.float2 zeroinitializer, align 4 + +define float @callee0([7 x float] %a, [7 x float] %b) { +entry: + %b.extract = extractvalue [7 x float] %b, 6 + ret float %b.extract +} +; CHECK-LABEL: @callee0 +; CHECK: stw 10, [[OFF:.*]](1) +; CHECK: lfs 1, [[OFF]](1) +; CHECK: blr + +define void @caller0([7 x float] %a) { +entry: + tail call void @test0([7 x float] %a, [7 x float] %a) + ret void +} +; CHECK-LABEL: @caller0 +; CHECK-DAG: fmr 8, 1 +; CHECK-DAG: fmr 9, 2 +; CHECK-DAG: fmr 10, 3 +; CHECK-DAG: fmr 11, 4 +; CHECK-DAG: fmr 12, 5 +; CHECK-DAG: fmr 13, 6 +; CHECK-DAG: stfs 7, [[OFF:[0-9]+]](1) +; CHECK-DAG: lwz 10, [[OFF]](1) +; CHECK: bl test0 + +declare void @test0([7 x float], [7 x float]) + +define float @callee1([8 x float] %a, [8 x float] %b) { +entry: + %b.extract = extractvalue [8 x float] %b, 7 + ret float %b.extract +} +; CHECK-LABEL: @callee1 +; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32 +; CHECK: stw [[REG]], [[OFF:.*]](1) +; CHECK: lfs 1, [[OFF]](1) +; CHECK: blr + +define void @caller1([8 x float] %a) { +entry: + tail call void @test1([8 x float] %a, [8 x float] %a) + ret void +} +; CHECK-LABEL: @caller1 +; CHECK-DAG: fmr 9, 1 +; CHECK-DAG: fmr 10, 2 +; CHECK-DAG: fmr 11, 3 +; CHECK-DAG: fmr 12, 4 +; CHECK-DAG: fmr 13, 5 +; CHECK-DAG: stfs 5, [[OFF0:[0-9]+]](1) +; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1) +; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1) +; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1) +; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1) +; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1) +; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1) +; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1) +; CHECK-DAG: sldi [[REG1]], [[REG1]], 32 +; CHECK-DAG: sldi [[REG3]], [[REG3]], 32 +; CHECK-DAG: or 9, [[REG0]], [[REG1]] +; CHECK-DAG: or 10, [[REG2]], [[REG3]] +; CHECK: bl test1 + +declare void @test1([8 x float], [8 x float]) + +define float @callee2([8 x float] %a, [5 x float] %b, [2 x float] %c) { +entry: + %c.extract = extractvalue [2 x float] %c, 1 + ret float %c.extract +} +; CHECK-LABEL: @callee2 +; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32 +; CHECK: stw [[REG]], [[OFF:.*]](1) +; CHECK: lfs 1, [[OFF]](1) +; CHECK: blr + +define void @caller2() { +entry: + %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4 + %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4 + %2 = load [2 x float]* getelementptr inbounds (%struct.float2* @g2, i64 0, i32 0), align 4 + tail call void @test2([8 x float] %0, [5 x float] %1, [2 x float] %2) + ret void +} +; CHECK-LABEL: @caller2 +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK-DAG: lfs 1, 0([[REG]]) +; CHECK-DAG: lfs 2, 4([[REG]]) +; CHECK-DAG: lfs 3, 8([[REG]]) +; CHECK-DAG: lfs 4, 12([[REG]]) +; CHECK-DAG: lfs 5, 16([[REG]]) +; CHECK-DAG: lfs 6, 20([[REG]]) +; CHECK-DAG: lfs 7, 24([[REG]]) +; CHECK-DAG: lfs 8, 28([[REG]]) +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK-DAG: lfs 9, 0([[REG]]) +; CHECK-DAG: lfs 10, 4([[REG]]) +; CHECK-DAG: lfs 11, 8([[REG]]) +; CHECK-DAG: lfs 12, 12([[REG]]) +; CHECK-DAG: lfs 13, 16([[REG]]) +; CHECK: ld [[REG:[0-9]+]], .LC +; CHECK-DAG: lwz [[REG0:[0-9]+]], 0([[REG]]) +; CHECK-DAG: lwz [[REG1:[0-9]+]], 4([[REG]]) +; CHECK-DAG: sldi [[REG1]], [[REG1]], 32 +; CHECK-DAG: or 10, [[REG0]], [[REG1]] +; CHECK: bl test2 + +declare void @test2([8 x float], [5 x float], [2 x float]) + +define double @callee3([8 x float] %a, [5 x float] %b, double %c) { +entry: + ret double %c +} +; CHECK-LABEL: @callee3 +; CHECK: std 
10, [[OFF:.*]](1) +; CHECK: lfd 1, [[OFF]](1) +; CHECK: blr + +define void @caller3(double %d) { +entry: + %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4 + %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4 + tail call void @test3([8 x float] %0, [5 x float] %1, double %d) + ret void +} +; CHECK-LABEL: @caller3 +; CHECK: stfd 1, [[OFF:.*]](1) +; CHECK: ld 10, [[OFF]](1) +; CHECK: bl test3 + +declare void @test3([8 x float], [5 x float], double) + +define float @callee4([8 x float] %a, [5 x float] %b, float %c) { +entry: + ret float %c +} +; CHECK-LABEL: @callee4 +; CHECK: stw 10, [[OFF:.*]](1) +; CHECK: lfs 1, [[OFF]](1) +; CHECK: blr + +define void @caller4(float %f) { +entry: + %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4 + %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4 + tail call void @test4([8 x float] %0, [5 x float] %1, float %f) + ret void +} +; CHECK-LABEL: @caller4 +; CHECK: stfs 1, [[OFF:.*]](1) +; CHECK: lwz 10, [[OFF]](1) +; CHECK: bl test4 + +declare void @test4([8 x float], [5 x float], float) + diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll index fb1835f580b..0fd9fc50892 100644 --- a/test/CodeGen/PowerPC/varargs-struct-float.ll +++ b/test/CodeGen/PowerPC/varargs-struct-float.ll @@ -16,8 +16,8 @@ entry: ret void } -; CHECK: stfs {{[0-9]+}}, 60(1) -; CHECK: ld 4, 56(1) +; CHECK: stfs {{[0-9]+}}, 116(1) +; CHECK: lwz 4, 116(1) ; CHECK: bl declare void @testvaSf1(i32, ...)
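As background for the tests above (an illustrative sketch only, not part of the patch or its test suite; the function name below is invented): under the ELFv2 ABI, clang represents a homogeneous float aggregate such as a C struct containing float f[4] as a direct [4 x float] IR argument rather than a "byval" pointer, and the functionArgumentNeedsConsecutiveRegisters override added in PPCISelLowering.h is what lets the backend place those array elements in consecutive registers.

; Hypothetical example; with this patch the [4 x float] argument
; arrives in f1..f4 on powerpc64le (ELFv2) instead of being forced to memory.
target triple = "powerpc64le-unknown-linux-gnu"

define float @sum_first_two([4 x float] %h) {
entry:
  %e0 = extractvalue [4 x float] %h, 0
  %e1 = extractvalue [4 x float] %h, 1
  %sum = fadd float %e0, %e1
  ret float %sum
}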