diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index f14179603eb..614b84c392c 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -4423,6 +4423,29 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
   return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
 }
 
+/// Return true if \p LoadMI is a MOVSSrm/VMOVSSrm or MOVSDrm/VMOVSDrm load
+/// whose destination register class is larger than the 4 or 8 bytes those
+/// instructions load. Both callers in foldMemoryOperandImpl refuse to fold
+/// the load (they return nullptr) when this returns true.
+static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
+                                  const MachineFunction &MF) {
+  unsigned Opc = LoadMI.getOpcode();
+  unsigned RegSize =
+      MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
+
+  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+    // These instructions only load 32 bits, we can't fold them if the
+    // destination register is wider than 32 bits (4 bytes).
+    return true;
+
+  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+    // These instructions only load 64 bits, we can't fold them if the
+    // destination register is wider than 64 bits (8 bytes).
+    return true;
+
+  return false;
+}
+
 MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   MachineInstr *MI,
                                            const SmallVectorImpl<unsigned> &Ops,
@@ -4430,8 +4453,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // If loading from a FrameIndex, fold directly from the FrameIndex.
   unsigned NumOps = LoadMI->getDesc().getNumOperands();
   int FrameIndex;
-  if (isLoadFromStackSlot(LoadMI, FrameIndex))
+  if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
+    if (isPartialRegisterLoad(*LoadMI, MF))
+      return nullptr;
     return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+  }
 
   // Check switch flag
   if (NoFusing) return nullptr;
@@ -4542,19 +4568,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     break;
   }
   default: {
-    if ((LoadMI->getOpcode() == X86::MOVSSrm ||
-         LoadMI->getOpcode() == X86::VMOVSSrm) &&
-        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
-          > 4)
-      // These instructions only load 32 bits, we can't fold them if the
-      // destination register is wider than 32 bits (4 bytes).
-      return nullptr;
-    if ((LoadMI->getOpcode() == X86::MOVSDrm ||
-         LoadMI->getOpcode() == X86::VMOVSDrm) &&
-        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
-          > 8)
-      // These instructions only load 64 bits, we can't fold them if the
-      // destination register is wider than 64 bits (8 bytes).
+    if (isPartialRegisterLoad(*LoadMI, MF))
       return nullptr;
 
     // Folding a normal load. Just copy the load's address operands.
diff --git a/test/CodeGen/X86/peephole-fold-movsd.ll b/test/CodeGen/X86/peephole-fold-movsd.ll
new file mode 100644
index 00000000000..cb0dfce938b
--- /dev/null
+++ b/test/CodeGen/X86/peephole-fold-movsd.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s
+;
+; Check that x86's peephole optimization doesn't fold a 64-bit load (movsd) into
+; addpd.
+; rdar://problem/18236850
+
+%struct.S1 = type { double, double }
+
+@g = common global %struct.S1 zeroinitializer, align 8
+
+declare void @foo3(%struct.S1*)
+
+; CHECK: movsd (%rsp), [[R0:%xmm[0-9]+]]
+; CHECK: addpd [[R0]], %xmm{{[0-9]+}}
+
+define void @foo1(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) {
+  %1 = alloca <2 x double>, align 16
+  %tmpcast = bitcast <2 x double>* %1 to %struct.S1*
+  call void @foo3(%struct.S1* %tmpcast) #2
+  %p2 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 0
+  %2 = load double* %p2, align 16
+  %p3 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 1
+  %3 = load double* %p3, align 8
+  %4 = insertelement <2 x double> undef, double %2, i32 0
+  %5 = insertelement <2 x double> %4, double 0.000000e+00, i32 1
+  %6 = insertelement <2 x double> undef, double %3, i32 1
+  %7 = insertelement <2 x double> %6, double 1.000000e+00, i32 0
+  %8 = fadd <2 x double> %5, %7
+  store <2 x double> %8, <2 x double>* bitcast (%struct.S1* @g to <2 x double>*), align 16
+  ret void
+}