diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 7afdb734ec2..b2e9c15b68b 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -102,11 +102,11 @@ static cl::opt AvgIPC( #ifndef NDEBUG namespace { // For sched=list-ilp, Count the number of times each factor comes into play. - enum { FactPressureDiff, FactRegUses, FactStall, FactHeight, FactDepth, - FactStatic, FactOther, NumFactors }; + enum { FactPressureDiff, FactRegUses, FactHeight, FactDepth, FactStatic, + FactOther, NumFactors }; } static const char *FactorName[NumFactors] = -{"PressureDiff", "RegUses", "Stall", "Height", "Depth","Static", "Other"}; +{"PressureDiff", "RegUses", "Height", "Depth","Static", "Other"}; static int FactorCount[NumFactors]; #endif //!NDEBUG @@ -463,13 +463,6 @@ void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) { if (DisableSchedCycles) return; - // FIXME: Nodes such as CopyFromReg probably should not advance the current - // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node - // has predecessors the cycle will be advanced when they are scheduled. - // But given the crude nature of modeling latency though such nodes, we - // currently need to treat these nodes like real instructions. - // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return; - unsigned ReadyCycle = isBottomUp ? SU->getHeight() : SU->getDepth(); // Bump CurCycle to account for latency. We assume the latency of other @@ -540,19 +533,16 @@ void ScheduleDAGRRList::EmitNode(SUnit *SU) { } } -static void resetVRegCycle(SUnit *SU); - /// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending /// count of its predecessors. If a predecessor pending count is zero, add it to /// the Available queue. 
void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { - DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: "); + DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: "); DEBUG(SU->dump(this)); #ifndef NDEBUG if (CurCycle < SU->getHeight()) - DEBUG(dbgs() << " Height [" << SU->getHeight() - << "] pipeline stall!\n"); + DEBUG(dbgs() << " Height [" << SU->getHeight() << "] pipeline stall!\n"); #endif // FIXME: Do not modify node height. It may interfere with @@ -569,7 +559,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { AvailableQueue->ScheduledNode(SU); // If HazardRec is disabled, and each inst counts as one cycle, then - // advance CurCycle before ReleasePredecessors to avoid useless pushes to + // advance CurCycle before ReleasePredecessors to avoid useless pushes to // PendingQueue for schedulers that implement HasReadyFilter. if (!HazardRec->isEnabled() && AvgIPC < 2) AdvanceToCycle(CurCycle + 1); @@ -590,25 +580,20 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { } } - resetVRegCycle(SU); - SU->isScheduled = true; // Conditions under which the scheduler should eagerly advance the cycle: // (1) No available instructions // (2) All pipelines full, so available instructions must have hazards. // - // If HazardRec is disabled, the cycle was pre-advanced before calling - // ReleasePredecessors. In that case, IssueCount should remain 0. + // If HazardRec is disabled, the cycle was advanced earlier. // // Check AvailableQueue after ReleasePredecessors in case of zero latency. 
- if (HazardRec->isEnabled() || AvgIPC > 1) { - if (SU->getNode() && SU->getNode()->isMachineOpcode()) - ++IssueCount; - if ((HazardRec->isEnabled() && HazardRec->atIssueLimit()) - || (!HazardRec->isEnabled() && IssueCount == AvgIPC)) - AdvanceToCycle(CurCycle + 1); - } + ++IssueCount; + if ((HazardRec->isEnabled() && HazardRec->atIssueLimit()) + || (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC) + || AvailableQueue->empty()) + AdvanceToCycle(CurCycle + 1); } /// CapturePred - This does the opposite of ReleasePred. Since SU is being @@ -1235,7 +1220,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() { // priority. If it is not ready put it back. Schedule the node. Sequence.reserve(SUnits.size()); while (!AvailableQueue->empty()) { - DEBUG(dbgs() << "Examining Available:\n"; + DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue->dump(this)); // Pick the best node to schedule taking all constraints into @@ -1676,6 +1661,17 @@ void RegReductionPQBase::CalculateSethiUllmanNumbers() { CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers); } +void RegReductionPQBase::initNodes(std::vector &sunits) { + SUnits = &sunits; + // Add pseudo dependency edges for two-address nodes. + AddPseudoTwoAddrDeps(); + // Reroute edges to nodes with multiple uses. + if (!TracksRegPressure) + PrescheduleNodesWithMultipleUses(); + // Calculate node priorities. + CalculateSethiUllmanNumbers(); +} + void RegReductionPQBase::addNode(const SUnit *SU) { unsigned SUSize = SethiUllmanNumbers.size(); if (SUnits->size() > SUSize) @@ -2012,29 +2008,7 @@ static unsigned calcMaxScratches(const SUnit *SU) { return Scratches; } -/// hasOnlyLiveInOpers - Return true if SU has only value predecessors that are -/// CopyFromReg from a virtual register. 
-static bool hasOnlyLiveInOpers(const SUnit *SU) { - bool RetVal = false; - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl()) continue; - const SUnit *PredSU = I->getSUnit(); - if (PredSU->getNode() && - PredSU->getNode()->getOpcode() == ISD::CopyFromReg) { - unsigned Reg = - cast(PredSU->getNode()->getOperand(1))->getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - RetVal = true; - continue; - } - } - return false; - } - return RetVal; -} - -/// hasOnlyLiveOutUses - Return true if SU has only value successors that are +/// hasOnlyLiveOutUses - Return true if SU has a single value successor that is a /// CopyToReg to a virtual register. This SU def is probably a liveout and /// it has no other use. It should be scheduled closer to the terminator. static bool hasOnlyLiveOutUses(const SUnit *SU) { @@ -2056,71 +2030,62 @@ static bool hasOnlyLiveOutUses(const SUnit *SU) { return RetVal; } -// Set isVRegCycle for a node with only live in opers and live out uses. Also -// set isVRegCycle for its CopyFromReg operands. -// -// This is only relevant for single-block loops, in which case the VRegCycle -// node is likely an induction variable in which the operand and target virtual -// registers should be coalesced (e.g. pre/post increment values). Setting the -// isVRegCycle flag helps the scheduler prioritize other uses of the same -// CopyFromReg so that this node becomes the virtual register "kill". This -// avoids interference between the values live in and out of the block and -// eliminates a copy inside the loop. 
-static void initVRegCycle(SUnit *SU) { - if (DisableSchedVRegCycle) - return; - - if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU)) - return; - - DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n"); - - SU->isVRegCycle = true; - - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl()) continue; - I->getSUnit()->isVRegCycle = true; - } -} - -// After scheduling the definition of a VRegCycle, clear the isVRegCycle flag of -// CopyFromReg operands. We should no longer penalize other uses of this VReg. -static void resetVRegCycle(SUnit *SU) { - if (!SU->isVRegCycle) - return; - - for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end(); +/// UnitsSharePred - Return true if the two scheduling units share a common +/// data predecessor. +static bool UnitsSharePred(const SUnit *left, const SUnit *right) { + SmallSet Preds; + for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end(); I != E; ++I) { if (I->isCtrl()) continue; // ignore chain preds - SUnit *PredSU = I->getSUnit(); - if (PredSU->isVRegCycle) { - assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg && - "VRegCycle def must be CopyFromReg"); - I->getSUnit()->isVRegCycle = 0; - } + Preds.insert(I->getSUnit()); } -} - -// Return true if this SUnit uses a CopyFromReg node marked as a VRegCycle. This -// means a node that defines the VRegCycle has not been scheduled yet. -static bool hasVRegCycleUse(const SUnit *SU) { - // If this SU also defines the VReg, don't hoist it as a "use". 
- if (SU->isVRegCycle) - return false; - - for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end(); + for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end(); I != E; ++I) { if (I->isCtrl()) continue; // ignore chain preds - if (I->getSUnit()->isVRegCycle && - I->getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) { - DEBUG(dbgs() << " VReg cycle use: SU (" << SU->NodeNum << ")\n"); + if (Preds.count(I->getSUnit())) return true; + } + return false; +} + +// Return true if the virtual register defined by VRCycleSU may interfere with +// VRUseSU. +// +// Note: We may consider two SU's that use the same value live into a loop as +// interfering even though the value is not an induction variable. This is an +// unfortunate consequence of scheduling on the selection DAG. +static bool checkVRegCycleInterference(const SUnit *VRCycleSU, + const SUnit *VRUseSU) { + for (SUnit::const_pred_iterator I = VRCycleSU->Preds.begin(), + E = VRCycleSU->Preds.end(); I != E; ++I) { + if (I->isCtrl()) continue; // ignore chain preds + SDNode *InNode = I->getSUnit()->getNode(); + if (!InNode || InNode->getOpcode() != ISD::CopyFromReg) + continue; + for (SUnit::const_pred_iterator II = VRUseSU->Preds.begin(), + EE = VRUseSU->Preds.end(); II != EE; ++II) { + if (II->getSUnit() == I->getSUnit()) + return true; } } return false; } +// Compare the VRegCycle properties of the nodes. +// Return -1 if left has higher priority, 1 if right has higher priority. +// Return 0 if priority is equivalent. +static int BUCompareVRegCycle(const SUnit *left, const SUnit *right) { + if (left->isVRegCycle && !right->isVRegCycle) { + if (checkVRegCycleInterference(left, right)) + return -1; + } + else if (!left->isVRegCycle && right->isVRegCycle) { + if (checkVRegCycleInterference(right, left)) + return 1; + } + return 0; +} + // Check for either a dependence (latency) or resource (hazard) stall. 
// // Note: The ScheduleHazardRecognizer interface requires a non-const SU. @@ -2136,12 +2101,23 @@ static bool BUHasStall(SUnit *SU, int Height, RegReductionPQBase *SPQ) { // Return 0 if latency-based priority is equivalent. static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref, RegReductionPQBase *SPQ) { - // Scheduling an instruction that uses a VReg whose postincrement has not yet - // been scheduled will induce a copy. Model this as an extra cycle of latency. - int LPenalty = hasVRegCycleUse(left) ? 1 : 0; - int RPenalty = hasVRegCycleUse(right) ? 1 : 0; - int LHeight = (int)left->getHeight() + LPenalty; - int RHeight = (int)right->getHeight() + RPenalty; + // If the two nodes share an operand and one of them has a single + // use that is a live out copy, favor the one that is live out. Otherwise + // it will be difficult to eliminate the copy if the instruction is a + // loop induction variable update. e.g. + // BB: + // sub r1, r3, #1 + // str r0, [r2, r3] + // mov r3, r1 + // cmp + // bne BB + bool SharePred = UnitsSharePred(left, right); + // FIXME: Only adjust if BB is a loop back edge. + // FIXME: What's the cost of a copy? + int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0; + int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0; + int LHeight = (int)left->getHeight() - LBonus; + int RHeight = (int)right->getHeight() - RBonus; bool LStall = (!checkPref || left->SchedulingPref == Sched::Latency) && BUHasStall(left, LHeight, SPQ); @@ -2152,47 +2128,36 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref, // If scheduling either one of the node will cause a pipeline stall, sort // them according to their height. if (LStall) { - if (!RStall) { - DEBUG(++FactorCount[FactStall]); + if (!RStall) return 1; - } - if (LHeight != RHeight) { - DEBUG(++FactorCount[FactStall]); + if (LHeight != RHeight) return LHeight > RHeight ? 
1 : -1; - } - } else if (RStall) { - DEBUG(++FactorCount[FactStall]); + } else if (RStall) return -1; - } // If either node is scheduling for latency, sort them by height/depth // and latency. if (!checkPref || (left->SchedulingPref == Sched::Latency || right->SchedulingPref == Sched::Latency)) { if (DisableSchedCycles) { - if (LHeight != RHeight) { - DEBUG(++FactorCount[FactHeight]); + if (LHeight != RHeight) return LHeight > RHeight ? 1 : -1; - } } else { // If neither instruction stalls (!LStall && !RStall) then // its height is already covered so only its depth matters. We also reach // this if both stall but have the same height. - int LDepth = left->getDepth() - LPenalty; - int RDepth = right->getDepth() - RPenalty; + unsigned LDepth = left->getDepth(); + unsigned RDepth = right->getDepth(); if (LDepth != RDepth) { - DEBUG(++FactorCount[FactDepth]); DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum << ") depth " << LDepth << " vs SU (" << right->NodeNum << ") depth " << RDepth << "\n"); return LDepth < RDepth ? 1 : -1; } } - if (left->Latency != right->Latency) { - DEBUG(++FactorCount[FactOther]); + if (left->Latency != right->Latency) return left->Latency > right->Latency ? 1 : -1; - } } return 0; } @@ -2204,19 +2169,7 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { DEBUG(++FactorCount[FactStatic]); return LPriority > RPriority; } - else if(LPriority == 0) { - // Schedule zero-latency TokenFactor below any other special - // nodes. The alternative may be to avoid artificially boosting the - // TokenFactor's height when it is scheduled, but we currently rely on an - // instruction's final height to equal the cycle in which it is scheduled, - // so heights are monotonically increasing. - unsigned LOpc = left->getNode() ? left->getNode()->getOpcode() : 0; - unsigned ROpc = right->getNode() ? 
right->getNode()->getOpcode() : 0; - if (LOpc == ISD::TokenFactor) - return false; - if (ROpc == ISD::TokenFactor) - return true; - } + DEBUG(++FactorCount[FactOther]); // Try schedule def + use closer when Sethi-Ullman numbers are the same. // e.g. @@ -2237,18 +2190,14 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { // This creates more short live intervals. unsigned LDist = closestSucc(left); unsigned RDist = closestSucc(right); - if (LDist != RDist) { - DEBUG(++FactorCount[FactOther]); + if (LDist != RDist) return LDist < RDist; - } // How many registers becomes live when the node is scheduled. unsigned LScratch = calcMaxScratches(left); unsigned RScratch = calcMaxScratches(right); - if (LScratch != RScratch) { - DEBUG(++FactorCount[FactOther]); + if (LScratch != RScratch) return LScratch > RScratch; - } if (!DisableSchedCycles) { int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ); @@ -2256,20 +2205,15 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { return result > 0; } else { - if (left->getHeight() != right->getHeight()) { - DEBUG(++FactorCount[FactHeight]); + if (left->getHeight() != right->getHeight()) return left->getHeight() > right->getHeight(); - } - if (left->getDepth() != right->getDepth()) { - DEBUG(++FactorCount[FactDepth]); + if (left->getDepth() != right->getDepth()) return left->getDepth() < right->getDepth(); - } } assert(left->NodeQueueId && right->NodeQueueId && "NodeQueueId cannot be zero"); - DEBUG(++FactorCount[FactOther]); return (left->NodeQueueId > right->NodeQueueId); } @@ -2320,22 +2264,24 @@ bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { // Avoid causing spills. If register pressure is high, schedule for // register pressure reduction. 
if (LHigh && !RHigh) { - DEBUG(++FactorCount[FactPressureDiff]); DEBUG(dbgs() << " pressure SU(" << left->NodeNum << ") > SU(" << right->NodeNum << ")\n"); return true; } else if (!LHigh && RHigh) { - DEBUG(++FactorCount[FactPressureDiff]); DEBUG(dbgs() << " pressure SU(" << right->NodeNum << ") > SU(" << left->NodeNum << ")\n"); return false; } - if (!LHigh && !RHigh) { - int result = BUCompareLatency(left, right, true /*checkPref*/, SPQ); - if (result != 0) - return result > 0; + int result = 0; + if (!DisableSchedVRegCycle) { + result = BUCompareVRegCycle(left, right); } + if (result == 0 && !LHigh && !RHigh) { + result = BUCompareLatency(left, right, true /*checkPref*/, SPQ); + } + if (result != 0) + return result > 0; return BURRSort(left, right, SPQ); } @@ -2401,6 +2347,12 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { if (RReduce && !LReduce) return true; } + if (!DisableSchedVRegCycle) { + int result = BUCompareVRegCycle(left, right); + if (result != 0) + return result > 0; + } + if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) { DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses << " != SU(" << right->NodeNum << "): " << RLiveUses << "\n"); @@ -2439,24 +2391,6 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { return BURRSort(left, right, SPQ); } -void RegReductionPQBase::initNodes(std::vector &sunits) { - SUnits = &sunits; - // Add pseudo dependency edges for two-address nodes. - AddPseudoTwoAddrDeps(); - // Reroute edges to nodes with multiple uses. - if (!TracksRegPressure) - PrescheduleNodesWithMultipleUses(); - // Calculate node priorities. - CalculateSethiUllmanNumbers(); - - // For single block loops, mark nodes that look like canonical IV increments. 
- if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB)) { - for (unsigned i = 0, e = sunits.size(); i != e; ++i) { - initVRegCycle(&sunits[i]); - } - } -} - //===----------------------------------------------------------------------===// // Preschedule for Register Pressure //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index fe853c349f9..24a1937c445 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -342,6 +342,10 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { assert(N->getNodeId() == -1 && "Node already inserted!"); N->setNodeId(NodeSUnit->NodeNum); + // Set isVRegCycle if the node operands are live into and value is live out + // of a single block loop. + InitVRegCycleFlag(NodeSUnit); + // Compute NumRegDefsLeft. This must be done before AddSchedEdges. InitNumRegDefsLeft(NodeSUnit); @@ -412,13 +416,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() { PhysReg = 0; // If this is a ctrl dep, latency is 1. - // Special-case TokenFactor chains as zero-latency. - unsigned OpLatency = 1; - if (!isChain && OpSU->Latency > 0) - OpLatency = OpSU->Latency; - else if(isChain && OpN->getOpcode() == ISD::TokenFactor) - OpLatency = 0; - + unsigned OpLatency = isChain ? 1 : OpSU->Latency; const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data, OpLatency, PhysReg); if (!isChain && !UnitLatencies) { @@ -514,6 +512,47 @@ void ScheduleDAGSDNodes::RegDefIter::Advance() { } } +// Set isVRegCycle if this node's single use is CopyToReg and its only active +// data operands are CopyFromReg. +// +// This is only relevant for single-block loops, in which case the VRegCycle +// node is likely an induction variable in which the operand and target virtual +// registers should be coalesced (e.g. pre/post increment values). 
Setting the +// isVRegCycle flag helps the scheduler prioritize other uses of the same +// CopyFromReg so that this node becomes the virtual register "kill". This +// avoids interference between the values live in and out of the block and +// eliminates a copy inside the loop. +void ScheduleDAGSDNodes::InitVRegCycleFlag(SUnit *SU) { + if (!BB->isSuccessor(BB)) + return; + + SDNode *N = SU->getNode(); + if (N->getGluedNode()) + return; + + if (!N->hasOneUse() || N->use_begin()->getOpcode() != ISD::CopyToReg) + return; + + bool FoundLiveIn = false; + for (SDNode::op_iterator OI = N->op_begin(), E = N->op_end(); OI != E; ++OI) { + EVT OpVT = OI->getValueType(); + assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!"); + + if (OpVT == MVT::Other) + continue; // ignore chain operands + + if (isPassiveNode(OI->getNode())) + continue; // ignore constants and such + + if (OI->getNode()->getOpcode() != ISD::CopyFromReg) + return; + + FoundLiveIn = true; + } + if (FoundLiveIn) + SU->isVRegCycle = true; +} + void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) { assert(SU->NumRegDefsLeft == 0 && "expect a new node"); for (RegDefIter I(SU, this); I.IsValid(); I.Advance()) { diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index 5bae037cafb..e8a2a3b7d5b 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -1,8 +1,10 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -regalloc=linearscan -disable-post-ra | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=linearscan -disable-post-ra | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=basic -disable-post-ra | FileCheck %s ; The ARM magic hinting works best with linear scan. 
-; CHECK: ldrd -; CHECK: strd +; CHECK: ldmia +; CHECK: stmia +; CHECK: ldrh ; CHECK: ldrb %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } diff --git a/test/CodeGen/ARM/vfp.ll b/test/CodeGen/ARM/vfp.ll index 49a69827bc0..390457fc21b 100644 --- a/test/CodeGen/ARM/vfp.ll +++ b/test/CodeGen/ARM/vfp.ll @@ -40,8 +40,8 @@ define void @test_add(float* %P, double* %D) { define void @test_ext_round(float* %P, double* %D) { ;CHECK: test_ext_round: %a = load float* %P ; [#uses=1] -;CHECK: vcvt.f64.f32 ;CHECK: vcvt.f32.f64 +;CHECK: vcvt.f64.f32 %b = fpext float %a to double ; [#uses=1] %A = load double* %D ; [#uses=1] %B = fptrunc double %A to float ; [#uses=1]