diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index d8793782228..33ed4cc907a 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -36,6 +36,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -56,11 +57,14 @@ STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address"); STATISTIC(Num3AddrSunk, "Number of 3-address instructions sunk"); STATISTIC(NumReMats, "Number of instructions re-materialized"); STATISTIC(NumDeletes, "Number of dead instructions deleted"); +STATISTIC(NumReSchedUps, "Number of instructions re-scheduled up"); +STATISTIC(NumReSchedDowns, "Number of instructions re-scheduled down"); namespace { class TwoAddressInstructionPass : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; + const InstrItineraryData *InstrItins; MachineRegisterInfo *MRI; LiveVariables *LV; AliasAnalysis *AA; @@ -120,6 +124,18 @@ namespace { MachineBasicBlock::iterator &nmi, MachineFunction::iterator &mbbi, unsigned Dist); + bool isDefTooClose(unsigned Reg, unsigned Dist, + MachineInstr *MI, MachineBasicBlock *MBB); + + bool RescheduleMIBelowKill(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &mi, + MachineBasicBlock::iterator &nmi, + unsigned Reg); + bool RescheduleKillAboveMI(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &mi, + MachineBasicBlock::iterator &nmi, + unsigned Reg); + bool TryInstructionTransform(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, MachineFunction::iterator &mbbi, @@ -467,6 +483,32 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { return false; } +/// findLocalKill - Look for an 
instruction below MI in the MBB that kills the +/// specified register. Returns null if there are any other Reg use between the +/// instructions. +static +MachineInstr *findLocalKill(unsigned Reg, MachineBasicBlock *MBB, + MachineInstr *MI, MachineRegisterInfo *MRI, + DenseMap<MachineInstr*, unsigned> &DistanceMap) { + MachineInstr *KillMI = 0; + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(Reg), + UE = MRI->use_nodbg_end(); UI != UE; ++UI) { + MachineInstr *UseMI = &*UI; + if (UseMI == MI || UseMI->getParent() != MBB) + continue; + DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI); + if (DI != DistanceMap.end()) + continue; + if (!UI.getOperand().isKill()) + return 0; + assert(!KillMI && "More than one local kills?"); + KillMI = UseMI; + } + + return KillMI; +} + /// findOnlyInterestingUse - Given a register, if has a single in-basic block /// use, return the use instruction if it's a copy or a two-address use. static @@ -852,6 +894,285 @@ TwoAddressInstructionPass::DeleteUnusedInstr(MachineBasicBlock::iterator &mi, return true; } +/// RescheduleMIBelowKill - If there is one more local instruction that reads +/// 'Reg' and it kills 'Reg', consider moving the instruction below the kill +/// instruction in order to eliminate the need for the copy. +bool +TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &mi, + MachineBasicBlock::iterator &nmi, + unsigned Reg) { + MachineInstr *MI = &*mi; + DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); + if (DI == DistanceMap.end()) + // Must be created from unfolded load. Don't waste time trying this. + return false; + + MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap); + if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike()) + // Don't mess with copies, they may be coalesced later. 
+ return false; + + const MCInstrDesc &MCID = KillMI->getDesc(); + if (MCID.hasUnmodeledSideEffects() || MCID.isCall() || MCID.isBranch() || + MCID.isTerminator()) + // Don't move past calls, etc. + return false; + + unsigned DstReg; + if (isTwoAddrUse(*KillMI, Reg, DstReg)) + return false; + + bool SeenStore = true; // Conservative: intervening stores are not tracked. + if (!MI->isSafeToMove(TII, AA, SeenStore)) + return false; + + if (TII->getInstrLatency(InstrItins, MI) > 1) + // FIXME: Needs more sophisticated heuristics. + return false; + + SmallSet<unsigned, 2> Uses; + SmallSet<unsigned, 2> Defs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned MOReg = MO.getReg(); + if (!MOReg) + continue; + if (MO.isDef()) + Defs.insert(MOReg); + else + Uses.insert(MOReg); + } + + // Move the copies connected to MI down as well. + MachineBasicBlock::iterator From = MI; + MachineBasicBlock::iterator To = llvm::next(From); + while (To->isCopy() && Defs.count(To->getOperand(1).getReg())) { + Defs.insert(To->getOperand(0).getReg()); + ++To; + } + + // Check if the reschedule will not break dependencies. + unsigned NumVisited = 0; + MachineBasicBlock::iterator KillPos = KillMI; + ++KillPos; + for (MachineBasicBlock::iterator I = To; I != KillPos; ++I) { + MachineInstr *OtherMI = I; + // DBG_VALUE cannot be counted against the limit. + if (OtherMI->isDebugValue()) + continue; + if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost. + return false; + ++NumVisited; + const MCInstrDesc &OMCID = OtherMI->getDesc(); + if (OMCID.hasUnmodeledSideEffects() || OMCID.isCall() || OMCID.isBranch() || + OMCID.isTerminator()) + // Don't move past calls, etc. 
+ return false; + for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = OtherMI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned MOReg = MO.getReg(); + if (!MOReg) + continue; + if (MO.isDef()) { + if (Uses.count(MOReg)) + // Physical register use would be clobbered. + return false; + if (!MO.isDead() && Defs.count(MOReg)) + // May clobber a physical register def. + // FIXME: This may be too conservative. It's ok if the instruction + // is sunken completely below the use. + return false; + } else { + if (Defs.count(MOReg)) + return false; + if (MOReg != Reg && MO.isKill() && Uses.count(MOReg)) + // Don't want to extend other live ranges and update kills. + return false; + } + } + } + + // Move debug info as well. + if (From != MBB->begin()) { + while (From != MBB->begin() && llvm::prior(From)->isDebugValue()) + --From; + } + + // Copies following MI may have been moved as well. + nmi = To; + MBB->splice(KillPos, MBB, From, To); + DistanceMap.erase(DI); + + if (LV) { + // Update live variables + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + } else { + for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = KillMI->getOperand(i); + if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) + continue; + MO.setIsKill(false); + } + MI->addRegisterKilled(Reg, 0); + } + + return true; +} + +/// isDefTooClose - Return true if the re-scheduling will put the given +/// instruction too close to the defs of its register dependencies. 
+bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist, + MachineInstr *MI, + MachineBasicBlock *MBB) { + for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(Reg), + DE = MRI->def_end(); DI != DE; ++DI) { + MachineInstr *DefMI = &*DI; + if (DefMI->getParent() != MBB || DefMI->isCopy() || DefMI->isCopyLike()) + continue; + if (DefMI == MI) + return true; // MI is defining something KillMI uses + DenseMap<MachineInstr*, unsigned>::iterator DDI = DistanceMap.find(DefMI); + if (DDI == DistanceMap.end()) + return true; // Below MI + unsigned DefDist = DDI->second; + assert(Dist > DefDist && "Visited def already?"); + if (TII->getInstrLatency(InstrItins, DefMI) > (int)(Dist - DefDist)) + return true; + } + return false; +} + +/// RescheduleKillAboveMI - If there is one more local instruction that reads +/// 'Reg' and it kills 'Reg', consider moving the kill instruction above the +/// current two-address instruction in order to eliminate the need for the +/// copy. +bool +TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &mi, + MachineBasicBlock::iterator &nmi, + unsigned Reg) { + MachineInstr *MI = &*mi; + DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); + if (DI == DistanceMap.end()) + // Must be created from unfolded load. Don't waste time trying this. + return false; + + MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap); + if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike()) + // Don't mess with copies, they may be coalesced later. 
+ return false; + + unsigned DstReg; + if (isTwoAddrUse(*KillMI, Reg, DstReg)) + return false; + + bool SeenStore = true; // Conservative: intervening stores are not tracked. + if (!KillMI->isSafeToMove(TII, AA, SeenStore)) + return false; + + SmallSet<unsigned, 2> Uses; + SmallSet<unsigned, 2> Kills; + SmallSet<unsigned, 2> Defs; + SmallSet<unsigned, 2> LiveDefs; + for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = KillMI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned MOReg = MO.getReg(); + if (MO.isUse()) { + if (!MOReg) + continue; + if (isDefTooClose(MOReg, DI->second, MI, MBB)) + return false; + Uses.insert(MOReg); + if (MO.isKill() && MOReg != Reg) + Kills.insert(MOReg); + } else if (TargetRegisterInfo::isPhysicalRegister(MOReg)) { + Defs.insert(MOReg); + if (!MO.isDead()) + LiveDefs.insert(MOReg); + } + } + + // Check if the reschedule will not break dependencies. + unsigned NumVisited = 0; + MachineBasicBlock::iterator KillPos = KillMI; + for (MachineBasicBlock::iterator I = mi; I != KillPos; ++I) { + MachineInstr *OtherMI = I; + // DBG_VALUE cannot be counted against the limit. + if (OtherMI->isDebugValue()) + continue; + if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost. + return false; + ++NumVisited; + const MCInstrDesc &MCID = OtherMI->getDesc(); + if (MCID.hasUnmodeledSideEffects() || MCID.isCall() || MCID.isBranch() || + MCID.isTerminator()) + // Don't move past calls, etc. + return false; + for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = OtherMI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned MOReg = MO.getReg(); + if (!MOReg) + continue; + if (MO.isUse()) { + if (Defs.count(MOReg)) + // Moving KillMI can clobber the physical register if the def has + // not been seen. + return false; + if (Kills.count(MOReg)) + // Don't want to extend other live ranges and update kills. 
+ return false; + } else { + if (Uses.count(MOReg)) + return false; + if (TargetRegisterInfo::isPhysicalRegister(MOReg) && + LiveDefs.count(MOReg)) + return false; + // Physical register def is seen. + Defs.erase(MOReg); + } + } + } + + // Move the old kill above MI, don't forget to move debug info as well. + MachineBasicBlock::iterator InsertPos = mi; + if (InsertPos != MBB->begin()) + while (InsertPos != MBB->begin() && llvm::prior(InsertPos)->isDebugValue()) + --InsertPos; + MachineBasicBlock::iterator From = KillMI; + MachineBasicBlock::iterator To = llvm::next(From); + while (llvm::prior(From)->isDebugValue()) + --From; + MBB->splice(InsertPos, MBB, From, To); + + nmi = llvm::prior(mi); // Backtrack so we process the moved instruction. + DistanceMap.erase(DI); + + if (LV) { + // Update live variables + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + } else { + for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = KillMI->getOperand(i); + if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) + continue; + MO.setIsKill(false); + } + MI->addRegisterKilled(Reg, 0); + } + return true; +} + /// TryInstructionTransform - For the case where an instruction has a single /// pair of tied register operands, attempt some transformations that may /// either eliminate the tied operands or improve the opportunities for @@ -863,17 +1184,18 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, MachineFunction::iterator &mbbi, unsigned SrcIdx, unsigned DstIdx, unsigned Dist, SmallPtrSet &Processed) { - const MCInstrDesc &MCID = mi->getDesc(); - unsigned regA = mi->getOperand(DstIdx).getReg(); - unsigned regB = mi->getOperand(SrcIdx).getReg(); + MachineInstr &MI = *mi; + const MCInstrDesc &MCID = MI.getDesc(); + unsigned regA = MI.getOperand(DstIdx).getReg(); + unsigned regB = MI.getOperand(SrcIdx).getReg(); assert(TargetRegisterInfo::isVirtualRegister(regB) && "cannot make instruction into two-address form"); // If regA is 
dead and the instruction can be deleted, just delete // it so it doesn't clobber regB. - bool regBKilled = isKilled(*mi, regB, MRI, TII); - if (!regBKilled && mi->getOperand(DstIdx).isDead() && + bool regBKilled = isKilled(MI, regB, MRI, TII); + if (!regBKilled && MI.getOperand(DstIdx).isDead() && DeleteUnusedInstr(mi, nmi, mbbi, Dist)) { ++NumDeletes; return true; // Done with this instruction. @@ -885,20 +1207,20 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, unsigned regCIdx = ~0U; bool TryCommute = false; bool AggressiveCommute = false; - if (MCID.isCommutable() && mi->getNumOperands() >= 3 && - TII->findCommutedOpIndices(mi, SrcOp1, SrcOp2)) { + if (MCID.isCommutable() && MI.getNumOperands() >= 3 && + TII->findCommutedOpIndices(&MI, SrcOp1, SrcOp2)) { if (SrcIdx == SrcOp1) regCIdx = SrcOp2; else if (SrcIdx == SrcOp2) regCIdx = SrcOp1; if (regCIdx != ~0U) { - regC = mi->getOperand(regCIdx).getReg(); - if (!regBKilled && isKilled(*mi, regC, MRI, TII)) + regC = MI.getOperand(regCIdx).getReg(); + if (!regBKilled && isKilled(MI, regC, MRI, TII)) // If C dies but B does not, swap the B and C operands. // This makes the live ranges of A and C joinable. TryCommute = true; - else if (isProfitableToCommute(regB, regC, mi, mbbi, Dist)) { + else if (isProfitableToCommute(regB, regC, &MI, mbbi, Dist)) { TryCommute = true; AggressiveCommute = true; } @@ -913,6 +1235,13 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, return false; } + // If there is one more use of regB later in the same MBB, consider + // re-schedule this MI below it. + if (RescheduleMIBelowKill(mbbi, mi, nmi, regB)) { + ++NumReSchedDowns; + return true; + } + if (TargetRegisterInfo::isVirtualRegister(regA)) ScanUses(regA, &*mbbi, Processed); @@ -928,6 +1257,13 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, } } + // If there is one more use of regB later in the same MBB, consider + // re-schedule it before this MI if it's legal. 
+ if (RescheduleKillAboveMI(mbbi, mi, nmi, regB)) { + ++NumReSchedUps; + return true; + } + // If this is an instruction with a load folded into it, try unfolding // the load, e.g. avoid this: // movq %rdx, %rcx @@ -940,7 +1276,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, // Determine if a load can be unfolded. unsigned LoadRegIndex; unsigned NewOpc = - TII->getOpcodeAfterMemoryUnfold(mi->getOpcode(), + TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(), /*UnfoldLoad=*/true, /*UnfoldStore=*/false, &LoadRegIndex); @@ -950,12 +1286,12 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, MachineFunction &MF = *mbbi->getParent(); // Unfold the load. - DEBUG(dbgs() << "2addr: UNFOLDING: " << *mi); + DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); const TargetRegisterClass *RC = TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI); unsigned Reg = MRI->createVirtualRegister(RC); SmallVector NewMIs; - if (!TII->unfoldMemoryOperand(MF, mi, Reg, + if (!TII->unfoldMemoryOperand(MF, &MI, Reg, /*UnfoldLoad=*/true,/*UnfoldStore=*/false, NewMIs)) { DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n"); @@ -986,21 +1322,21 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, // Success, or at least we made an improvement. Keep the unfolded // instructions and discard the original. 
if (LV) { - for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { - MachineOperand &MO = mi->getOperand(i); + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) { if (MO.isUse()) { if (MO.isKill()) { if (NewMIs[0]->killsRegister(MO.getReg())) - LV->replaceKillInstruction(MO.getReg(), mi, NewMIs[0]); + LV->replaceKillInstruction(MO.getReg(), &MI, NewMIs[0]); else { assert(NewMIs[1]->killsRegister(MO.getReg()) && "Kill missing after load unfold!"); - LV->replaceKillInstruction(MO.getReg(), mi, NewMIs[1]); + LV->replaceKillInstruction(MO.getReg(), &MI, NewMIs[1]); } } - } else if (LV->removeVirtualRegisterDead(MO.getReg(), mi)) { + } else if (LV->removeVirtualRegisterDead(MO.getReg(), &MI)) { if (NewMIs[1]->registerDefIsDead(MO.getReg())) LV->addVirtualRegisterDead(MO.getReg(), NewMIs[1]); else { @@ -1013,7 +1349,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, } LV->addVirtualRegisterKilled(Reg, NewMIs[1]); } - mi->eraseFromParent(); + MI.eraseFromParent(); mi = NewMIs[1]; if (TransformSuccess) return true; @@ -1040,6 +1376,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TII = TM.getInstrInfo(); TRI = TM.getRegisterInfo(); + InstrItins = TM.getInstrItineraryData(); LV = getAnalysisIfAvailable(); AA = &getAnalysis(); diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll index 4a6f5316a68..7f2bd75560c 100644 --- a/test/CodeGen/X86/iv-users-in-other-loops.ll +++ b/test/CodeGen/X86/iv-users-in-other-loops.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=x86-64 -enable-lsr-nested -o %t ; RUN: not grep inc %t ; RUN: grep dec %t | count 2 -; RUN: grep addq %t | count 10 +; RUN: grep addq %t | count 12 ; RUN: not grep addb %t ; RUN: not grep leal %t ; RUN: not grep movq %t diff --git a/test/CodeGen/X86/lsr-reuse-trunc.ll 
b/test/CodeGen/X86/lsr-reuse-trunc.ll index 5f5e0937a3b..1f87089f80e 100644 --- a/test/CodeGen/X86/lsr-reuse-trunc.ll +++ b/test/CodeGen/X86/lsr-reuse-trunc.ll @@ -4,14 +4,13 @@ ; Full strength reduction wouldn't reduce register pressure, so LSR should ; stick with indexing here. -; FIXME: This is worse off from disabling of scheduler 2-address hack. ; CHECK: movaps (%{{rsi|rdx}},%rax,4), [[X3:%xmm[0-9]+]] -; CHECK: leaq 4(%rax), %{{rcx|r9}} ; CHECK: cvtdq2ps ; CHECK: orps {{%xmm[0-9]+}}, [[X4:%xmm[0-9]+]] ; CHECK: movaps [[X4]], (%{{rdi|rcx}},%rax,4) -; CHECK: cmpl %{{ecx|r9d}}, (%{{rdx|r8}}) -; CHECK: jg +; CHECK: addq $4, %rax +; CHECK: cmpl %eax, (%{{rdx|r8}}) +; CHECK-NEXT: jg define void @vvfloorf(float* nocapture %y, float* nocapture %x, i32* nocapture %n) nounwind { entry: diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll index 291069d4625..5ea1b4dff1c 100644 --- a/test/CodeGen/X86/sse3.ll +++ b/test/CodeGen/X86/sse3.ll @@ -164,12 +164,12 @@ define internal void @t10() nounwind { store <4 x i16> %6, <4 x i16>* @g2, align 8 ret void ; X64: t10: -; X64: pextrw $4, [[X0:%xmm[0-9]+]], %eax -; X64: movlhps [[X1:%xmm[0-9]+]] -; X64: pshuflw $8, [[X1]], [[X2:%xmm[0-9]+]] -; X64: pinsrw $2, %eax, [[X2]] +; X64: pextrw $4, [[X0:%xmm[0-9]+]], %ecx ; X64: pextrw $6, [[X0]], %eax -; X64: pinsrw $3, %eax, [[X2]] +; X64: movlhps [[X0]], [[X0]] +; X64: pshuflw $8, [[X0]], [[X0]] +; X64: pinsrw $2, %ecx, [[X0]] +; X64: pinsrw $3, %eax, [[X0]] } @@ -232,10 +232,9 @@ entry: %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > ret <8 x i16> %tmp8 ; X64: t15: -; X64: movdqa %xmm0, %xmm2 +; X64: pextrw $7, %xmm0, %eax ; X64: punpcklqdq %xmm1, %xmm0 ; X64: pshuflw $-128, %xmm0, %xmm0 -; X64: pextrw $7, %xmm2, %eax ; X64: pinsrw $2, %eax, %xmm0 ; X64: ret } @@ -248,12 +247,11 @@ entry: %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 
17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > ret <16 x i8> %tmp9 ; X64: t16: -; X64: movdqa %xmm1, %xmm2 -; X64: pslldq $2, %xmm2 -; X64: movd %xmm2, %eax -; X64: pinsrw $0, %eax, %xmm0 -; X64: pextrw $8, %xmm1, %eax -; X64: pextrw $1, %xmm2, %ecx +; X64: pextrw $8, %xmm0, %eax +; X64: pslldq $2, %xmm0 +; X64: movd %xmm0, %ecx +; X64: pextrw $1, %xmm0, %edx +; X64: pinsrw $0, %ecx, %xmm0 ; X64: ret }