diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 9833441d76d..e76af695812 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2131,6 +2131,34 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
   return ArgSize;
 }

+
+/// CalculateStackSlotAlignment - Calculates the alignment of this argument
+/// on the stack.
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags,
+                                            unsigned PtrByteSize) {
+  unsigned Align = PtrByteSize;
+
+  // Altivec parameters are padded to a 16 byte boundary.
+  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+    Align = 16;
+
+  // ByVal parameters are aligned as requested.
+  if (Flags.isByVal()) {
+    unsigned BVAlign = Flags.getByValAlign();
+    if (BVAlign > PtrByteSize) {
+      if (BVAlign % PtrByteSize != 0)
+        llvm_unreachable(
+          "ByVal alignment is not a multiple of the pointer size");
+
+      Align = BVAlign;
+    }
+  }
+
+  return Align;
+}
+
 /// EnsureStackAlignment - Round stack frame size up from NumBytes to
 /// ensure minimum alignment required for target.
 static unsigned EnsureStackAlignment(const TargetMachine &Target,
@@ -2422,8 +2450,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(

   unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
   unsigned ArgOffset = LinkageSize;
-  // Area that is at least reserved in caller of this function.
-  unsigned MinReservedArea = ArgOffset;

   static const MCPhysReg GPR[] = {
     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
@@ -2445,7 +2471,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   const unsigned Num_FPR_Regs = 13;
   const unsigned Num_VR_Regs  = array_lengthof(VR);

-  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;

   // Add DAG nodes to load the arguments or copy them out of registers.  On
   // entry to a function on PPC, the arguments start after the linkage area,
@@ -2464,16 +2490,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
     CurArgIdx = Ins[ArgNo].OrigArgIndex;

+    // Respect alignment of argument on the stack.
+    unsigned Align =
+      CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize);
+    ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
     unsigned CurArgOffset = ArgOffset;

-    // Altivec parameters are padded to a 16 byte boundary.
-    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
-        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8 ||
-        ObjectVT==MVT::v2f64 || ObjectVT==MVT::v2i64)
-      MinReservedArea = ((MinReservedArea+15)/16)*16;
-
-    // Calculate min reserved area.
-    MinReservedArea += CalculateStackSlotSize(ObjectVT, Flags, PtrByteSize);
+    // Compute GPR index associated with argument offset.
+    GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+    GPR_idx = std::min(GPR_idx, Num_GPR_Regs);

     // FIXME the codegen can be much improved in some cases.
     // We do not have to keep everything in memory.
@@ -2495,12 +2520,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
         continue;
       }

-      unsigned BVAlign = Flags.getByValAlign();
-      if (BVAlign > 8) {
-        ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
-        CurArgOffset = ArgOffset;
-      }
-
       // All aggregates smaller than 8 bytes must be passed right-justified.
       if (ObjSize < PtrByteSize && !isLittleEndian)
         CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
@@ -2536,7 +2555,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
           }

           MemOps.push_back(Store);
-          ++GPR_idx;
         }
         // Whether we copied from a register or not, advance the offset
        // into the parameter save area by a full doubleword.
@@ -2581,8 +2599,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
          // PPC64 passes i8, i16, and i32 values in i64 registers.  Promote
          // value to MVT::i64 and then truncate to the correct register size.
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
-
-        ++GPR_idx;
       } else {
         needsLoad = true;
         ArgSize = PtrByteSize;
@@ -2592,11 +2608,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(

     case MVT::f32:
     case MVT::f64:
-      // Every 8 bytes of argument space consumes one of the GPRs available for
-      // argument passing.
-      if (GPR_idx != Num_GPR_Regs) {
-        ++GPR_idx;
-      }
       if (FPR_idx != Num_FPR_Regs) {
         unsigned VReg;

@@ -2622,12 +2633,6 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     case MVT::v16i8:
     case MVT::v2f64:
     case MVT::v2i64:
-      // Vectors are aligned to a 16-byte boundary in the argument save area.
-      while ((ArgOffset % 16) != 0) {
-        ArgOffset += PtrByteSize;
-        if (GPR_idx != Num_GPR_Regs)
-          GPR_idx++;
-      }
       if (VR_idx != Num_VR_Regs) {
         unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
                         MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
@@ -2635,11 +2640,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
         ++VR_idx;
       } else {
-        CurArgOffset = ArgOffset;
         needsLoad = true;
       }
       ArgOffset += 16;
-      GPR_idx = std::min(GPR_idx + 2, Num_GPR_Regs);
       break;
     }

@@ -2658,7 +2661,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   }

   // Area that is at least reserved in the caller of this function.
-  MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
+  unsigned MinReservedArea;
+  MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);

   // Set the size that is at least reserved in caller of this function.  Tail
   // call optimized functions' reserved stack space needs to be aligned so that
@@ -2679,7 +2683,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
     // If this function is vararg, store any remaining integer argument regs
     // to their spots on the stack so that they may be loaded by deferencing the
     // result of va_next.
-    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
       unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
       SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
       SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -3971,15 +3976,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
     EVT ArgVT = Outs[i].VT;

-    // Altivec parameters are padded to a 16 byte boundary.
-    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
-        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
-        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
-      NumBytes = ((NumBytes+15)/16)*16;
+    // Respect alignment of argument on the stack.
+    unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize);
+    NumBytes = ((NumBytes + Align - 1) / Align) * Align;

     NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
   }

+  unsigned NumBytesActuallyUsed = NumBytes;
+
   // The prolog code of the callee may store up to 8 GPR argument registers to
   // the stack, allowing va_start to index over them in memory if its varargs.
   // Because we cannot tell if this is needed on the caller side, we have to
@@ -4023,7 +4028,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   // must be stored to our stack, and loaded into integer regs as well, if
   // any integer regs are available for argument passing.
   unsigned ArgOffset = LinkageSize;
-  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;

   static const MCPhysReg GPR[] = {
     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
@@ -4052,6 +4057,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     SDValue Arg = OutVals[i];
     ISD::ArgFlagsTy Flags = Outs[i].Flags;

+    // Respect alignment of argument on the stack.
+    unsigned Align =
+      CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize);
+    ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+    // Compute GPR index associated with argument offset.
+    GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+    GPR_idx = std::min(GPR_idx, NumGPRs);
+
     // PtrOff will be used to store the current argument to the stack if a
     // register cannot be found for it.
     SDValue PtrOff;
@@ -4083,15 +4097,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       if (Size == 0)
         continue;

-      unsigned BVAlign = Flags.getByValAlign();
-      if (BVAlign > 8) {
-        if (BVAlign % PtrByteSize != 0)
-          llvm_unreachable(
-            "ByVal alignment is not a multiple of the pointer size");
-
-        ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
-      }
-
       // All aggregates smaller than 8 bytes must be passed right-justified.
       if (Size==1 || Size==2 || Size==4) {
         EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
@@ -4100,7 +4105,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                       MachinePointerInfo(), VT,
                                       false, false, 0);
         MemOpChains.push_back(Load.getValue(1));
-        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));

         ArgOffset += PtrByteSize;
         continue;
@@ -4162,7 +4167,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                      MachinePointerInfo(),
                                      false, false, false, 0);
           MemOpChains.push_back(Load.getValue(1));
-          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));

           // Done with this argument.
           ArgOffset += PtrByteSize;
@@ -4195,7 +4200,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     case MVT::i32:
     case MVT::i64:
       if (GPR_idx != NumGPRs) {
-        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
       } else {
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          true, isTailCall, false, MemOpChains,
@@ -4230,11 +4235,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                      MachinePointerInfo(),
                                      false, false, false, 0);
           MemOpChains.push_back(Load.getValue(1));
-          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
         }
-      } else if (GPR_idx != NumGPRs)
-        // If we have any FPRs remaining, we may also have GPRs remaining.
-        ++GPR_idx;
+      }
     } else {
       // Single-precision floating-point values are mapped to the
       // second (rightmost) word of the stack doubleword.
@@ -4255,13 +4258,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
     case MVT::v16i8:
     case MVT::v2f64:
     case MVT::v2i64:
-      // Vectors are aligned to a 16-byte boundary in the argument save area.
-      while (ArgOffset % 16 !=0) {
-        ArgOffset += PtrByteSize;
-        if (GPR_idx != NumGPRs)
-          GPR_idx++;
-      }
-
       // For a varargs call, named arguments go into VRs or on the stack as
       // usual; unnamed arguments always go to the stack or the corresponding
       // GPRs when within range.  For now, we always put the value in both
@@ -4269,8 +4265,6 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       if (isVarArg) {
         // We could elide this store in the case where the object fits
         // entirely in R registers.  Maybe later.
-        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
-                             DAG.getConstant(ArgOffset, PtrVT));
         SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
                                      MachinePointerInfo(), false, false, 0);
         MemOpChains.push_back(Store);
@@ -4315,11 +4309,12 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                          TailCallArguments, dl);
       }
       ArgOffset += 16;
-      GPR_idx = std::min(GPR_idx + 2, NumGPRs);
       break;
     }
   }

+  assert(NumBytesActuallyUsed == ArgOffset);
+
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

diff --git a/test/CodeGen/PowerPC/ppc64-byval-align.ll b/test/CodeGen/PowerPC/ppc64-byval-align.ll
new file mode 100644
index 00000000000..1bbd6da85b2
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-byval-align.ll
@@ -0,0 +1,56 @@
+; RUN: llc -O1 < %s -march=ppc64 | FileCheck %s
+
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.test = type { i64, [8 x i8] }
+%struct.pad = type { [8 x i64] }
+
+@gt = common global %struct.test zeroinitializer, align 16
+@gp = common global %struct.pad zeroinitializer, align 8
+
+define signext i32 @callee1(i32 signext %x, %struct.test* byval align 16 nocapture readnone %y, i32 signext %z) {
+entry:
+  ret i32 %z
+}
+; CHECK-LABEL: @callee1
+; CHECK: mr 3, 7
+; CHECK: blr
+
+declare signext i32 @test1(i32 signext, %struct.test* byval align 16, i32 signext)
+
+define void @caller1(i32 signext %z) {
+entry:
+  %call = tail call signext i32 @test1(i32 signext 0, %struct.test* byval align 16 @gt, i32 signext %z)
+  ret void
+}
+; CHECK-LABEL: @caller1
+; CHECK: mr [[REG:[0-9]+]], 3
+; CHECK: mr 7, [[REG]]
+; CHECK: bl test1
+
+define i64 @callee2(%struct.pad* byval nocapture readnone %x, i32 signext %y, %struct.test* byval align 16 nocapture readonly %z) {
+entry:
+  %x1 = getelementptr inbounds %struct.test* %z, i64 0, i32 0
+  %0 = load i64* %x1, align 16
+  ret i64 %0
+}
+; CHECK-LABEL: @callee2
+; CHECK: ld [[REG:[0-9]+]], 128(1)
+; CHECK: mr 3, [[REG]]
+; CHECK: blr
+
+declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16)
+
+define void @caller2(i64 %z) {
+entry:
+  %tmp = alloca %struct.test, align 16
+  %.compoundliteral.sroa.0.0..sroa_idx = getelementptr inbounds %struct.test* %tmp, i64 0, i32 0
+  store i64 %z, i64* %.compoundliteral.sroa.0.0..sroa_idx, align 16
+  %call = call i64 @test2(%struct.pad* byval @gp, i32 signext 0, %struct.test* byval align 16 %tmp)
+  ret void
+}
+; CHECK-LABEL: @caller2
+; CHECK: std 3, [[OFF:[0-9]+]](1)
+; CHECK: ld [[REG:[0-9]+]], [[OFF]](1)
+; CHECK: std [[REG]], 128(1)
+; CHECK: bl test2
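The net effect of the patch is that the parameter-save-area offset becomes the single source of truth: each argument's slot alignment comes from CalculateStackSlotAlignment, ArgOffset is rounded up to that alignment, and the GPR index is re-derived from the offset, so alignment padding automatically skips the corresponding registers on both the caller and callee sides. Previously GPR_idx was advanced in lock-step at each use site and could drift from ArgOffset when padding was inserted, which is what the new assert(NumBytesActuallyUsed == ArgOffset) guards against. The standalone sketch below replays that arithmetic for the callee2/caller2 case in the new test; slotAlignment and the constants are illustrative stand-ins (assuming the 64-bit ELFv1 ABI with its 48-byte linkage area), not code from the patch.

#include <algorithm>
#include <cassert>
#include <cstdio>

// Stand-in mirroring the shape of CalculateStackSlotAlignment: vector types
// get 16-byte alignment, byval aggregates get their requested alignment when
// it exceeds the pointer size, everything else gets pointer-size alignment.
static unsigned slotAlignment(bool IsVector, bool IsByVal, unsigned ByValAlign,
                              unsigned PtrByteSize) {
  unsigned Align = PtrByteSize;
  if (IsVector)
    Align = 16;
  if (IsByVal && ByValAlign > PtrByteSize) {
    assert(ByValAlign % PtrByteSize == 0 &&
           "ByVal alignment is not a multiple of the pointer size");
    Align = ByValAlign;
  }
  return Align;
}

int main() {
  const unsigned PtrByteSize = 8;  // 64-bit ELF ABI
  const unsigned LinkageSize = 48; // ELFv1 linkage area (assumed here)
  const unsigned NumGPRs = 8;      // X3..X10

  // callee2/caller2 from the new test: a 64-byte %struct.pad passed byval,
  // then an i32, then a %struct.test passed byval align 16.
  unsigned ArgOffset = LinkageSize;
  ArgOffset += 64;          // %struct.pad fills 8 doublewords -> 112
  ArgOffset += PtrByteSize; // the i32 occupies a full doubleword -> 120

  // Round up to the byval alignment, as the patch does for every argument.
  unsigned Align = slotAlignment(false, true, 16, PtrByteSize);
  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; // -> 128

  // The GPR index is derived from the offset instead of being incremented in
  // lock-step, so the padding doubleword skips a register automatically.
  unsigned GPR_idx = std::min((ArgOffset - LinkageSize) / PtrByteSize, NumGPRs);

  // Prints "offset=128 gpr_idx=8": the struct is past the last GPR and lives
  // only in memory, matching the 128(1) load/store the CHECK lines expect.
  std::printf("offset=%u gpr_idx=%u\n", ArgOffset, GPR_idx);
  return 0;
}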