diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index f4b9039f482..90f10c1fccb 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -27,6 +27,7 @@ namespace llvm { bool NoExcessFPPrecision; int PatternISelTriState; bool UnsafeFPMath; + bool PICEnabled; }; namespace { cl::opt PrintCode("print-machineinstrs", @@ -52,6 +53,11 @@ namespace { cl::desc("Enable optimizations that may decrease FP precision"), cl::location(UnsafeFPMath), cl::init(false)); + cl::opt + EnablePIC("enable-pic", + cl::desc("Enable generation of position independent code"), + cl::location(PICEnabled), + cl::init(false)); }; //--------------------------------------------------------------------------- diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 1bb4ec2e9c3..2fc022af4b1 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -28,6 +28,7 @@ enum X86VectorEnum { }; extern X86VectorEnum X86Vector; +extern bool X86ScalarSSE; /// createX86SimpleInstructionSelector - This pass converts an LLVM function /// into a machine code representation in a very simple peep-hole fashion. The diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 874391dda9f..afa3ff7b71c 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -61,7 +61,7 @@ def IntelAsmWriter : AsmWriter { def X86 : Target { // Specify the callee saved registers. - let CalleeSavedRegisters = [ESI, EDI, EBX, EBP]; + let CalleeSavedRegisters = [ESI, EDI, EBX, EBP, XMM4, XMM5, XMM6, XMM7]; // Yes, pointers are 32-bits in size. let PointerType = i32; diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index d55edc5841c..789b8e26cef 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -361,8 +361,18 @@ void Emitter::emitInstruction(const MachineInstr &MI) { // Emit the repeat opcode prefix as needed. if ((Desc.TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3); - // Emit instruction prefixes if necessary - if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66);// Operand size... + // Emit the operand size opcode prefix as needed. + if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66); + + // Emit the double precision sse fp opcode prefix as needed. + if ((Desc.TSFlags & X86II::Op0Mask) == X86II::XD) { + MCE.emitByte(0xF2); MCE.emitByte(0x0F); + } + + // Emit the single precision sse fp opcode prefix as needed. + if ((Desc.TSFlags & X86II::Op0Mask) == X86II::XS) { + MCE.emitByte(0xF3); MCE.emitByte(0x0F); + } switch (Desc.TSFlags & X86II::Op0Mask) { case X86II::TB: diff --git a/lib/Target/X86/X86ISelPattern.cpp b/lib/Target/X86/X86ISelPattern.cpp index bc728a7a41b..5c561aa1716 100644 --- a/lib/Target/X86/X86ISelPattern.cpp +++ b/lib/Target/X86/X86ISelPattern.cpp @@ -97,15 +97,13 @@ namespace { setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0 // Set up the register classes. + // FIXME: Eliminate these two classes when legalize can handle promotions + // well. + addRegisterClass(MVT::i1, X86::R8RegisterClass); addRegisterClass(MVT::i8, X86::R8RegisterClass); addRegisterClass(MVT::i16, X86::R16RegisterClass); addRegisterClass(MVT::i32, X86::R32RegisterClass); - addRegisterClass(MVT::f64, X86::RFPRegisterClass); - - // FIXME: Eliminate these two classes when legalize can handle promotions - // well. 
-/**/ addRegisterClass(MVT::i1, X86::R8RegisterClass); - + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); setOperationAction(ISD::BRCONDTWOWAY , MVT::Other, Expand); setOperationAction(ISD::MEMMOVE , MVT::Other, Expand); @@ -123,7 +121,7 @@ namespace { setOperationAction(ISD::CTPOP , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTLZ , MVT::i32 , Expand); - + setOperationAction(ISD::READIO , MVT::i1 , Expand); setOperationAction(ISD::READIO , MVT::i8 , Expand); setOperationAction(ISD::READIO , MVT::i16 , Expand); @@ -132,24 +130,47 @@ namespace { setOperationAction(ISD::WRITEIO , MVT::i8 , Expand); setOperationAction(ISD::WRITEIO , MVT::i16 , Expand); setOperationAction(ISD::WRITEIO , MVT::i32 , Expand); - - if (!UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f64 , Expand); - setOperationAction(ISD::FCOS , MVT::f64 , Expand); - } - + // These should be promoted to a larger select which is supported. -/**/ setOperationAction(ISD::SELECT , MVT::i1 , Promote); + setOperationAction(ISD::SELECT , MVT::i1 , Promote); setOperationAction(ISD::SELECT , MVT::i8 , Promote); - + + if (X86ScalarSSE) { + // Set up the FP register classes. + addRegisterClass(MVT::f32, X86::RXMMRegisterClass); + addRegisterClass(MVT::f64, X86::RXMMRegisterClass); + + setOperationAction(ISD::EXTLOAD, MVT::f32, Expand); + setOperationAction(ISD::ZEXTLOAD, MVT::f32, Expand); + + // We don't support sin/cos/sqrt/fmod + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FABS , MVT::f64, Expand); + setOperationAction(ISD::FNEG , MVT::f64, Expand); + setOperationAction(ISD::SREM , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FABS , MVT::f32, Expand); + setOperationAction(ISD::FNEG , MVT::f32, Expand); + setOperationAction(ISD::SREM , MVT::f32, Expand); + } else { + // Set up the FP register classes. + addRegisterClass(MVT::f64, X86::RFPRegisterClass); + + if (!UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64 , Expand); + setOperationAction(ISD::FCOS , MVT::f64 , Expand); + } + + addLegalFPImmediate(+0.0); // FLD0 + addLegalFPImmediate(+1.0); // FLD1 + addLegalFPImmediate(-0.0); // FLD0/FCHS + addLegalFPImmediate(-1.0); // FLD1/FCHS + } computeRegisterProperties(); - - addLegalFPImmediate(+0.0); // FLD0 - addLegalFPImmediate(+1.0); // FLD1 - addLegalFPImmediate(-0.0); // FLD0/FCHS - addLegalFPImmediate(-1.0); // FLD1/FCHS } - + // Return the number of bytes that a function should pop when it returns (in // addition to the space used by the return address). 
// @@ -400,7 +421,10 @@ X86TargetLowering::LowerCCCCallTo(SDOperand Chain, const Type *RetTy, RetVals.push_back(MVT::i32); break; case MVT::f32: - RetVals.push_back(MVT::f64); + if (X86ScalarSSE) + RetVals.push_back(MVT::f32); + else + RetVals.push_back(MVT::f64); break; case MVT::i64: RetVals.push_back(MVT::i32); @@ -805,7 +829,10 @@ X86TargetLowering::LowerFastCCCallTo(SDOperand Chain, const Type *RetTy, RetVals.push_back(MVT::i32); break; case MVT::f32: - RetVals.push_back(MVT::f64); + if (X86ScalarSSE) + RetVals.push_back(MVT::f32); + else + RetVals.push_back(MVT::f64); break; case MVT::i64: RetVals.push_back(MVT::i32); @@ -1041,6 +1068,8 @@ void ISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) { BuildMI(BB, X86::MOV32rr, 1, LI->second).addReg(LI->first); } else if (RC == X86::RFPRegisterClass) { BuildMI(BB, X86::FpMOV, 1, LI->second).addReg(LI->first); + } else if (RC == X86::RXMMRegisterClass) { + BuildMI(BB, X86::MOVAPDrr, 1, LI->second).addReg(LI->first); } else { assert(0 && "Unknown regclass!"); } @@ -1641,6 +1670,11 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT, /*missing*/0, /*missing*/0, X86::FCMOVB , X86::FCMOVBE, X86::FCMOVA , X86::FCMOVAE, X86::FCMOVP , X86::FCMOVNP }; + static const unsigned SSE_CMOVTAB[] = { + 0 /* CMPEQSS */, 4 /* CMPNEQSS */, 1 /* CMPLTSS */, 2 /* CMPLESS */, + 2 /* CMPLESS */, 1 /* CMPLTSS */, /*missing*/0, /*missing*/0, + /*missing*/0, /*missing*/0, /*missing*/0, /*missing*/0 + }; if (SetCCSDNode *SetCC = dyn_cast(Cond)) { if (MVT::isInteger(SetCC->getOperand(0).getValueType())) { @@ -1657,6 +1691,20 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT, case ISD::SETULE: CondCode = BE; break; case ISD::SETUGE: CondCode = AE; break; } + } else if (X86ScalarSSE) { + switch (SetCC->getCondition()) { + default: assert(0 && "Unknown scalar fp comparison!"); + case ISD::SETEQ: CondCode = EQ; break; + case ISD::SETNE: CondCode = NE; break; + case ISD::SETULT: + case ISD::SETLT: CondCode = LT; break; + case ISD::SETULE: + case ISD::SETLE: CondCode = LE; break; + case ISD::SETUGT: + case ISD::SETGT: CondCode = GT; break; + case ISD::SETUGE: + case ISD::SETGE: CondCode = GE; break; + } } else { // On a floating point condition, the flags are set as follows: // ZF PF CF op @@ -1693,6 +1741,79 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT, } } + // There's no SSE equivalent of FCMOVE. In some cases we can fake it up, in + // Others we will have to do the PowerPC thing and generate an MBB for the + // true and false values and select between them with a PHI. + if (X86ScalarSSE) { + if (CondCode != NOT_SET) { + unsigned CMPSOpc = (SVT == MVT::f64) ? X86::CMPSDrr : X86::CMPSSrr; + unsigned CMPSImm = SSE_CMOVTAB[CondCode]; + // FIXME check for min + // FIXME check for max + // FIXME check for reverse + unsigned LHS = SelectExpr(Cond.getOperand(0)); + unsigned RHS = SelectExpr(Cond.getOperand(1)); + // emit compare mask + unsigned MaskReg = MakeReg(SVT); + BuildMI(BB, CMPSOpc, 3, MaskReg).addReg(LHS).addReg(RHS).addImm(CMPSImm); + // emit and with mask + unsigned TrueMask = MakeReg(SVT); + unsigned AndOpc = (SVT == MVT::f32) ? X86::ANDPSrr : X86::ANDPDrr; + BuildMI(BB, AndOpc, 2, TrueMask).addReg(RTrue).addReg(MaskReg); + // emit and with inverse mask + unsigned FalseMask = MakeReg(SVT); + unsigned AndnOpc = (SVT == MVT::f32) ? X86::ANDNPSrr : X86::ANDNPDrr; + BuildMI(BB, AndnOpc, 2, FalseMask).addReg(RFalse).addReg(MaskReg); + // emit or into dest reg + unsigned OROpc = (SVT == MVT::f32) ? 
X86::ORPSrr : X86::ORPDrr; + BuildMI(BB, OROpc, 2, RDest).addReg(TrueMask).addReg(FalseMask); + return; + } else { + // do the test and branch thing + // Get the condition into the zero flag. + unsigned CondReg = SelectExpr(Cond); + BuildMI(BB, X86::TEST8rr, 2).addReg(CondReg).addReg(CondReg); + + // Create an iterator with which to insert the MBB for copying the false + // value and the MBB to hold the PHI instruction for this SetCC. + MachineBasicBlock *thisMBB = BB; + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + ilist::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC sinkMBB + // fallthrough --> copy0MBB + MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB); + BuildMI(BB, X86::JNE, 1).addMBB(sinkMBB); + MachineFunction *F = BB->getParent(); + F->getBasicBlockList().insert(It, copy0MBB); + F->getBasicBlockList().insert(It, sinkMBB); + // Update machine-CFG edges + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(BB, X86::PHI, 4, RDest).addReg(RFalse) + .addMBB(copy0MBB).addReg(RTrue).addMBB(thisMBB); + } + return; + } + unsigned Opc = 0; if (CondCode != NOT_SET) { switch (SVT) { @@ -1702,7 +1823,7 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT, case MVT::f64: Opc = CMOVTABFP[CondCode]; break; } } - + // Finally, if we weren't able to fold this, just emit the condition and test // it. if (CondCode == NOT_SET || Opc == 0) { @@ -1757,8 +1878,8 @@ void ISel::EmitCMP(SDOperand LHS, SDOperand RHS, bool HasOneUse) { return; } } else if (ConstantFPSDNode *CN = dyn_cast(RHS)) { - if (CN->isExactlyValue(+0.0) || - CN->isExactlyValue(-0.0)) { + if (!X86ScalarSSE && (CN->isExactlyValue(+0.0) || + CN->isExactlyValue(-0.0))) { unsigned Reg = SelectExpr(LHS); BuildMI(BB, X86::FTST, 1).addReg(Reg); BuildMI(BB, X86::FNSTSW8r, 0); @@ -1791,7 +1912,8 @@ void ISel::EmitCMP(SDOperand LHS, SDOperand RHS, bool HasOneUse) { case MVT::i8: Opc = X86::CMP8rr; break; case MVT::i16: Opc = X86::CMP16rr; break; case MVT::i32: Opc = X86::CMP32rr; break; - case MVT::f64: Opc = X86::FUCOMIr; break; + case MVT::f32: Opc = X86::UCOMISSrr; break; + case MVT::f64: Opc = X86ScalarSSE ? X86::UCOMISDrr : X86::FUCOMIr; break; } unsigned Tmp1, Tmp2; if (getRegPressure(LHS) > getRegPressure(RHS)) { @@ -2040,6 +2162,11 @@ unsigned ISel::SelectExpr(SDOperand N) { default: Node->dump(); assert(0 && "Node not handled!\n"); + case ISD::FP_EXTEND: + assert(X86ScalarSSE && "Scalar SSE FP must be enabled to use f32"); + Tmp1 = SelectExpr(N.getOperand(0)); + BuildMI(BB, X86::CVTSS2SDrr, 1, Result).addReg(Tmp1); + return Result; case ISD::CopyFromReg: Select(N.getOperand(0)); if (Result == 1) { @@ -2212,6 +2339,37 @@ unsigned ISel::SelectExpr(SDOperand N) { case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: { + Tmp1 = SelectExpr(N.getOperand(0)); // Get the operand register + unsigned PromoteOpcode = 0; + + // We can handle any sint to fp, and 8 and 16 uint to fp with the direct + // sse conversion instructions. 
+ if (X86ScalarSSE) { + MVT::ValueType SrcTy = N.getOperand(0).getValueType(); + MVT::ValueType DstTy = N.getValueType(); + switch (SrcTy) { + case MVT::i1: + case MVT::i8: + PromoteOpcode = (N.getOpcode() == ISD::UINT_TO_FP) ? + X86::MOVZX32rr8 : X86::MOVSX32rr8; + break; + case MVT::i16: + PromoteOpcode = (N.getOpcode() == ISD::UINT_TO_FP) ? + X86::MOVZX32rr16 : X86::MOVSX32rr16; + break; + default: + assert(N.getOpcode() != ISD::UINT_TO_FP); + break; + } + if (PromoteOpcode) { + BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1); + Tmp1 = Tmp2; + } + Opc = (DstTy == MVT::f64) ? X86::CVTSI2SDrr : X86::CVTSI2SSrr; + BuildMI(BB, Opc, 1, Result).addReg(Tmp1); + return Result; + } + // FIXME: Most of this grunt work should be done by legalize! ContainsFPCode = true; @@ -2221,8 +2379,6 @@ unsigned ISel::SelectExpr(SDOperand N) { // MVT::ValueType PromoteType = MVT::Other; MVT::ValueType SrcTy = N.getOperand(0).getValueType(); - unsigned PromoteOpcode = 0; - unsigned RealDestReg = Result; switch (SrcTy) { case MVT::i1: case MVT::i8: @@ -2245,8 +2401,6 @@ unsigned ISel::SelectExpr(SDOperand N) { break; } - Tmp1 = SelectExpr(N.getOperand(0)); // Get the operand register - if (PromoteType != MVT::Other) { Tmp2 = MakeReg(PromoteType); BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1); @@ -2272,31 +2426,28 @@ unsigned ISel::SelectExpr(SDOperand N) { break; default: break; // No promotion required. } - - if (Node->getOpcode() == ISD::UINT_TO_FP && Result != RealDestReg) { - // If this is a cast from uint -> double, we need to be careful when if - // the "sign" bit is set. If so, we don't want to make a negative number, - // we want to make a positive number. Emit code to add an offset if the - // sign bit is set. - - // Compute whether the sign bit is set by shifting the reg right 31 bits. - unsigned IsNeg = MakeReg(MVT::i32); - BuildMI(BB, X86::SHR32ri, 2, IsNeg).addReg(Tmp1).addImm(31); - - // Create a CP value that has the offset in one word and 0 in the other. - static ConstantInt *TheOffset = ConstantUInt::get(Type::ULongTy, - 0x4f80000000000000ULL); - unsigned CPI = F->getConstantPool()->getConstantPoolIndex(TheOffset); - BuildMI(BB, X86::FADD32m, 5, RealDestReg).addReg(Result) - .addConstantPoolIndex(CPI).addZImm(4).addReg(IsNeg).addSImm(0); - } - return RealDestReg; + return Result; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { // FIXME: Most of this grunt work should be done by legalize! Tmp1 = SelectExpr(N.getOperand(0)); // Get the operand register + // If the target supports SSE2 and is performing FP operations in SSE regs + // instead of the FP stack, then we can use the efficient CVTSS2SI and + // CVTSD2SI instructions. + if (ISD::FP_TO_SINT == N.getOpcode() && X86ScalarSSE) { + if (MVT::f32 == N.getOperand(0).getValueType()) { + BuildMI(BB, X86::CVTSS2SIrr, 1, Result).addReg(Tmp1); + } else if (MVT::f64 == N.getOperand(0).getValueType()) { + BuildMI(BB, X86::CVTSD2SIrr, 1, Result).addReg(Tmp1); + } else { + assert(0 && "Not an f32 or f64?"); + abort(); + } + return Result; + } + // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. // @@ -2385,9 +2536,15 @@ unsigned ISel::SelectExpr(SDOperand N) { case MVT::i8: Opc = X86::ADD8rm; break; case MVT::i16: Opc = X86::ADD16rm; break; case MVT::i32: Opc = X86::ADD32rm; break; + case MVT::f32: Opc = X86::ADDSSrm; break; case MVT::f64: // For F64, handle promoted load operations (from F32) as well! - Opc = Op1.getOpcode() == ISD::LOAD ? 
X86::FADD64m : X86::FADD32m; + if (X86ScalarSSE) { + assert(Op1.getOpcode() == ISD::LOAD && "SSE load not promoted"); + Opc = X86::ADDSDrm; + } else { + Opc = Op1.getOpcode() == ISD::LOAD ? X86::FADD64m : X86::FADD32m; + } break; } X86AddressMode AM; @@ -2458,7 +2615,8 @@ unsigned ISel::SelectExpr(SDOperand N) { case MVT::i8: Opc = X86::ADD8rr; break; case MVT::i16: Opc = X86::ADD16rr; break; case MVT::i32: Opc = X86::ADD32rr; break; - case MVT::f64: Opc = X86::FpADD; break; + case MVT::f32: Opc = X86::ADDSSrr; break; + case MVT::f64: Opc = X86ScalarSSE ? X86::ADDSDrr : X86::FpADD; break; } if (getRegPressure(Op0) > getRegPressure(Op1)) { @@ -2472,18 +2630,29 @@ unsigned ISel::SelectExpr(SDOperand N) { BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2); return Result; + case ISD::FSQRT: + Tmp1 = SelectExpr(Node->getOperand(0)); + if (X86ScalarSSE) { + Opc = (N.getValueType() == MVT::f32) ? X86::SQRTSSrr : X86::SQRTSDrr; + BuildMI(BB, Opc, 1, Result).addReg(Tmp1); + } else { + BuildMI(BB, X86::FSQRT, 1, Result).addReg(Tmp1); + } + return Result; + + // FIXME: + // Once we can spill 16 byte constants into the constant pool, we can + // implement SSE equivalents of FABS and FCHS. case ISD::FABS: case ISD::FNEG: case ISD::FSIN: case ISD::FCOS: - case ISD::FSQRT: assert(N.getValueType()==MVT::f64 && "Illegal type for this operation"); Tmp1 = SelectExpr(Node->getOperand(0)); switch (N.getOpcode()) { default: assert(0 && "Unreachable!"); case ISD::FABS: BuildMI(BB, X86::FABS, 1, Result).addReg(Tmp1); break; case ISD::FNEG: BuildMI(BB, X86::FCHS, 1, Result).addReg(Tmp1); break; - case ISD::FSQRT: BuildMI(BB, X86::FSQRT, 1, Result).addReg(Tmp1); break; case ISD::FSIN: BuildMI(BB, X86::FSIN, 1, Result).addReg(Tmp1); break; case ISD::FCOS: BuildMI(BB, X86::FCOS, 1, Result).addReg(Tmp1); break; } @@ -2550,11 +2719,21 @@ unsigned ISel::SelectExpr(SDOperand N) { X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::FSUB32m, X86::FSUB64m, X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::FpSUB , X86::FpSUB, }; + static const unsigned SSE_SUBTab[] = { + X86::SUB8ri, X86::SUB16ri, X86::SUB32ri, 0, 0, + X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::SUBSSrm, X86::SUBSDrm, + X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::SUBSSrr, X86::SUBSDrr, + }; static const unsigned MULTab[] = { 0, X86::IMUL16rri, X86::IMUL32rri, 0, 0, 0, X86::IMUL16rm , X86::IMUL32rm, X86::FMUL32m, X86::FMUL64m, 0, X86::IMUL16rr , X86::IMUL32rr, X86::FpMUL , X86::FpMUL, }; + static const unsigned SSE_MULTab[] = { + 0, X86::IMUL16rri, X86::IMUL32rri, 0, 0, + 0, X86::IMUL16rm , X86::IMUL32rm, X86::MULSSrm, X86::MULSDrm, + 0, X86::IMUL16rr , X86::IMUL32rr, X86::MULSSrr, X86::MULSDrr, + }; static const unsigned ANDTab[] = { X86::AND8ri, X86::AND16ri, X86::AND32ri, 0, 0, X86::AND8rm, X86::AND16rm, X86::AND32rm, 0, 0, @@ -2637,8 +2816,8 @@ unsigned ISel::SelectExpr(SDOperand N) { } switch (Node->getOpcode()) { default: assert(0 && "Unreachable!"); - case ISD::SUB: Opc = SUBTab[Opc]; break; - case ISD::MUL: Opc = MULTab[Opc]; break; + case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break; + case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break; case ISD::AND: Opc = ANDTab[Opc]; break; case ISD::OR: Opc = ORTab[Opc]; break; case ISD::XOR: Opc = XORTab[Opc]; break; @@ -2656,7 +2835,7 @@ unsigned ISel::SelectExpr(SDOperand N) { goto FoldOps; } else { // For FP, emit 'reverse' subract, with a memory operand. 
- if (N.getValueType() == MVT::f64) { + if (N.getValueType() == MVT::f64 && !X86ScalarSSE) { if (Op0.getOpcode() == ISD::EXTLOAD) Opc = X86::FSUBR32m; else @@ -2678,13 +2857,17 @@ unsigned ISel::SelectExpr(SDOperand N) { case MVT::i8: Opc = 5; break; case MVT::i16: Opc = 6; break; case MVT::i32: Opc = 7; break; + case MVT::f32: Opc = 8; break; // For F64, handle promoted load operations (from F32) as well! - case MVT::f64: Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break; + case MVT::f64: + assert((!X86ScalarSSE || Op1.getOpcode() == ISD::LOAD) && + "SSE load should have been promoted"); + Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break; } switch (Node->getOpcode()) { default: assert(0 && "Unreachable!"); - case ISD::SUB: Opc = SUBTab[Opc]; break; - case ISD::MUL: Opc = MULTab[Opc]; break; + case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break; + case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break; case ISD::AND: Opc = ANDTab[Opc]; break; case ISD::OR: Opc = ORTab[Opc]; break; case ISD::XOR: Opc = XORTab[Opc]; break; @@ -2725,8 +2908,8 @@ unsigned ISel::SelectExpr(SDOperand N) { } switch (Node->getOpcode()) { default: assert(0 && "Unreachable!"); - case ISD::SUB: Opc = SUBTab[Opc]; break; - case ISD::MUL: Opc = MULTab[Opc]; break; + case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break; + case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break; case ISD::AND: Opc = ANDTab[Opc]; break; case ISD::OR: Opc = ORTab[Opc]; break; case ISD::XOR: Opc = XORTab[Opc]; break; @@ -2844,7 +3027,7 @@ unsigned ISel::SelectExpr(SDOperand N) { if (N.getOpcode() == ISD::SDIV) { // We can fold loads into FpDIVs, but not really into any others. - if (N.getValueType() == MVT::f64) { + if (N.getValueType() == MVT::f64 || !X86ScalarSSE) { // Check for reversed and unreversed DIV. if (isFoldableLoad(N.getOperand(0), N.getOperand(1), true)) { if (N.getOperand(0).getOpcode() == ISD::EXTLOAD) @@ -2962,8 +3145,12 @@ unsigned ISel::SelectExpr(SDOperand N) { ClrOpcode = X86::MOV32ri; SExtOpcode = X86::CDQ; break; + case MVT::f32: + BuildMI(BB, X86::DIVSSrr, 2, Result).addReg(Tmp1).addReg(Tmp2); + return Result; case MVT::f64: - BuildMI(BB, X86::FpDIV, 2, Result).addReg(Tmp1).addReg(Tmp2); + Opc = X86ScalarSSE ? 
X86::DIVSDrr : X86::FpDIV; + BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2); return Result; } @@ -3108,7 +3295,15 @@ unsigned ISel::SelectExpr(SDOperand N) { case MVT::i8: Opc = X86::MOV8rm; break; case MVT::i16: Opc = X86::MOV16rm; break; case MVT::i32: Opc = X86::MOV32rm; break; - case MVT::f64: Opc = X86::FLD64m; ContainsFPCode = true; break; + case MVT::f32: Opc = X86::MOVSSrm; break; + case MVT::f64: + if (X86ScalarSSE) { + Opc = X86::MOVSDrm; + } else { + Opc = X86::FLD64m; + ContainsFPCode = true; + } + break; } if (ConstantPoolSDNode *CP = dyn_cast(N.getOperand(1))){ @@ -3385,9 +3580,21 @@ unsigned ISel::SelectExpr(SDOperand N) { BuildMI(BB, X86::MOV32rr, 1, Result+1).addReg(X86::EDX); break; case MVT::f64: // Floating-point return values live in %ST(0) - ContainsFPCode = true; - BuildMI(BB, X86::FpGETRESULT, 1, Result); - break; + if (X86ScalarSSE) { + ContainsFPCode = true; + BuildMI(BB, X86::FpGETRESULT, 1, X86::FP0); + + unsigned Size = MVT::getSizeInBits(MVT::f64)/8; + MachineFunction *F = BB->getParent(); + int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size); + addFrameReference(BuildMI(BB, X86::FST64m, 5), FrameIdx).addReg(X86::FP0); + addFrameReference(BuildMI(BB, X86::MOVSDrm, 4, Result), FrameIdx); + break; + } else { + ContainsFPCode = true; + BuildMI(BB, X86::FpGETRESULT, 1, Result); + break; + } } return Result+N.ResNo-1; } @@ -3977,7 +4184,15 @@ void ISel::Select(SDOperand N) { case MVT::i8: Opc = X86::MOV8rr; break; case MVT::i16: Opc = X86::MOV16rr; break; case MVT::i32: Opc = X86::MOV32rr; break; - case MVT::f64: Opc = X86::FpMOV; ContainsFPCode = true; break; + case MVT::f32: Opc = X86::MOVAPSrr; break; + case MVT::f64: + if (X86ScalarSSE) { + Opc = X86::MOVAPDrr; + } else { + Opc = X86::FpMOV; + ContainsFPCode = true; + } + break; } BuildMI(BB, Opc, 1, Tmp2).addReg(Tmp1); } @@ -4018,12 +4233,38 @@ void ISel::Select(SDOperand N) { } switch (N.getOperand(1).getValueType()) { default: assert(0 && "All other types should have been promoted!!"); + case MVT::f32: + if (X86ScalarSSE) { + // Spill the value to memory and reload it into top of stack. + unsigned Size = MVT::getSizeInBits(MVT::f32)/8; + MachineFunction *F = BB->getParent(); + int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size); + addFrameReference(BuildMI(BB, X86::MOVSSmr, 5), FrameIdx).addReg(Tmp1); + addFrameReference(BuildMI(BB, X86::FLD32m, 4, X86::FP0), FrameIdx); + BuildMI(BB, X86::FpSETRESULT, 1).addReg(X86::FP0); + ContainsFPCode = true; + } else { + assert(0 && "MVT::f32 only legal with scalar sse fp"); + abort(); + } + break; case MVT::f64: - BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1); - break; + if (X86ScalarSSE) { + // Spill the value to memory and reload it into top of stack. 
+ unsigned Size = MVT::getSizeInBits(MVT::f64)/8; + MachineFunction *F = BB->getParent(); + int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size); + addFrameReference(BuildMI(BB, X86::MOVSDmr, 5), FrameIdx).addReg(Tmp1); + addFrameReference(BuildMI(BB, X86::FLD64m, 4, X86::FP0), FrameIdx); + BuildMI(BB, X86::FpSETRESULT, 1).addReg(X86::FP0); + ContainsFPCode = true; + } else { + BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1); + } + break; case MVT::i32: - BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1); - break; + BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1); + break; } break; case 1: @@ -4144,7 +4385,9 @@ void ISel::Select(SDOperand N) { switch (StoredTy) { default: assert(0 && "Cannot truncstore this type!"); case MVT::i1: Opc = X86::MOV8mr; break; - case MVT::f32: Opc = X86::FST32m; break; + case MVT::f32: + assert(!X86ScalarSSE && "Cannot truncstore scalar SSE regs"); + Opc = X86::FST32m; break; } std::vector > RP; @@ -4176,7 +4419,6 @@ void ISel::Select(SDOperand N) { case MVT::i8: Opc = X86::MOV8mi; break; case MVT::i16: Opc = X86::MOV16mi; break; case MVT::i32: Opc = X86::MOV32mi; break; - case MVT::f64: break; } if (Opc) { if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(2))) { @@ -4215,7 +4457,8 @@ void ISel::Select(SDOperand N) { case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; case MVT::i32: Opc = X86::MOV32mr; break; - case MVT::f64: Opc = X86::FST64m; break; + case MVT::f32: Opc = X86::MOVSSmr; break; + case MVT::f64: Opc = X86ScalarSSE ? X86::MOVSDmr : X86::FST64m; break; } std::vector > RP; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index bda2cb73f6f..957360b2013 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -28,7 +28,7 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI, unsigned& destReg) const { MachineOpCode oc = MI.getOpcode(); if (oc == X86::MOV8rr || oc == X86::MOV16rr || oc == X86::MOV32rr || - oc == X86::FpMOV) { + oc == X86::FpMOV || oc == X86::MOVAPDrr) { assert(MI.getNumOperands() == 2 && MI.getOperand(0).isRegister() && MI.getOperand(1).isRegister() && diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 5b63ff93f5c..95e8205a00b 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -107,6 +107,10 @@ namespace X86II { DA = 5 << Op0Shift, DB = 6 << Op0Shift, DC = 7 << Op0Shift, DD = 8 << Op0Shift, DE = 9 << Op0Shift, DF = 10 << Op0Shift, + + // XS, XD - These prefix codes are for single and double precision scalar + // floating point operations performed in the SSE registers. + XD = 11 << Op0Shift, XS = 12 << Op0Shift, //===------------------------------------------------------------------===// // This two-bit field describes the size of an immediate operand. Zero is diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 39a4317bc2b..1376d8fe8f0 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -187,7 +187,8 @@ def JG : IBr<0x8F, (ops i32imm:$dst), "jg $dst">, TB; // let isCall = 1 in // All calls clobber the non-callee saved registers... 
- let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0] in { + let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + XMM0, XMM1, XMM2, XMM3] in { def CALLpcrel32 : I<0xE8, RawFrm, (ops calltarget:$dst), "call $dst">; def CALL32r : I<0xFF, MRM2r, (ops R32:$dst), "call {*}$dst">; def CALL32m : I<0xFF, MRM2m, (ops i32mem:$dst), "call {*}$dst">; @@ -1436,6 +1437,23 @@ def CVTSS2SDrr: I<0x5A, MRMSrcReg, (ops R32:$dst, RXMM:$src), "cvtss2sd {$src, $dst|$dst, $src}">, XD; def CVTSS2SDrm: I<0x5A, MRMSrcMem, (ops R32:$dst, f32mem:$src), "cvtss2sd {$src, $dst|$dst, $src}">, XD; +def CVTSI2SSrr: I<0x2A, MRMSrcReg, (ops R32:$dst, RXMM:$src), + "cvtsi2ss {$src, $dst|$dst, $src}">, XS; +def CVTSI2SSrm: I<0x2A, MRMSrcMem, (ops R32:$dst, f32mem:$src), + "cvtsi2ss {$src, $dst|$dst, $src}">, XS; +def CVTSI2SDrr: I<0x2A, MRMSrcReg, (ops R32:$dst, RXMM:$src), + "cvtsi2sd {$src, $dst|$dst, $src}">, XD; +def CVTSI2SDrm: I<0x2A, MRMSrcMem, (ops R32:$dst, f64mem:$src), + "cvtsi2sd {$src, $dst|$dst, $src}">, XD; + +def SQRTSSrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f32mem:$src), + "sqrtss {$src, $dst|$dst, $src}">, XS; +def SQRTSSrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src), + "sqrtss {$src, $dst|$dst, $src}">, XS; +def SQRTSDrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f64mem:$src), + "sqrtsd {$src, $dst|$dst, $src}">, XD; +def SQRTSDrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src), + "sqrtsd {$src, $dst|$dst, $src}">, XD; def UCOMISDrr: I<0x2E, MRMSrcReg, (ops RXMM:$dst, RXMM:$src), "ucomisd {$src, $dst|$dst, $src}">, TB, OpSize; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 08920cc2605..230debf7a7a 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -52,6 +52,7 @@ static unsigned getIdx(unsigned SpillSize) { case 32: return 2; case 64: return 3; // FP in 64-bit spill mode. case 80: return 4; // FP in 80-bit spill mode. + case 128: return 5; // XMM reg in 128-bit mode. 
} } @@ -59,18 +60,24 @@ void X86RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, int FrameIdx) const { static const unsigned Opcode[] = - { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST64m, X86::FSTP80m }; + { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST64m, X86::FSTP80m, + X86::MOVAPDmr }; unsigned Idx = getIdx(getSpillSize(SrcReg)); - addFrameReference(BuildMI(MBB, MI, Opcode[Idx], 5), FrameIdx).addReg(SrcReg); + unsigned Opc = Opcode[Idx]; + if (X86ScalarSSE && Opc == X86::FST64m) Opc = X86::MOVSDmr; + addFrameReference(BuildMI(MBB, MI, Opc, 5), FrameIdx).addReg(SrcReg); } void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIdx)const{ static const unsigned Opcode[] = - { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD64m, X86::FLD80m }; + { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD64m, X86::FLD80m, + X86::MOVAPDrm }; unsigned Idx = getIdx(getSpillSize(DestReg)); - addFrameReference(BuildMI(MBB, MI, Opcode[Idx], 4, DestReg), FrameIdx); + unsigned Opc = Opcode[Idx]; + if (X86ScalarSSE && Opc == X86::FLD64m) Opc = X86::MOVSDrm; + addFrameReference(BuildMI(MBB, MI, Opc, 4, DestReg), FrameIdx); } void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB, @@ -78,8 +85,11 @@ void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, const TargetRegisterClass *RC) const { static const unsigned Opcode[] = - { X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::FpMOV }; - BuildMI(MBB, MI, Opcode[getIdx(RC->getSize()*8)], 1, DestReg).addReg(SrcReg); + { X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::FpMOV, + X86::MOVAPDrr }; + unsigned Opc = Opcode[getIdx(RC->getSize()*8)]; + if (X86ScalarSSE && Opc == X86::FpMOV) Opc = X86::MOVAPDrr; + BuildMI(MBB, MI, Opc, 1, DestReg).addReg(SrcReg); } static MachineInstr *MakeMInst(unsigned Opcode, unsigned FrameIndex, diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index da8e612daef..30190fc18c4 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -99,8 +99,8 @@ def R32 : RegisterClass { // FIXME: These registers can contain both integer and fp values. We should // figure out the right way to deal with that. For now, since they'll be used // for scalar FP, they are being declared f64 -def RXMM : RegisterClass; +def RXMM : RegisterClass; // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. This will cause us to spill diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 2330182372c..def4f9cfa49 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -26,6 +26,7 @@ using namespace llvm; X86VectorEnum llvm::X86Vector = NoSSE; +bool llvm::X86ScalarSSE = false; /// X86TargetMachineModule - Note that this is used on hosts that cannot link /// in a library unless there are references into the library. In particular, @@ -41,8 +42,11 @@ namespace { cl::opt DisableOutput("disable-x86-llc-output", cl::Hidden, cl::desc("Disable the X86 asm printer, for use " "when profiling the code generator.")); + cl::opt EnableSSEFP("enable-sse-scalar-fp", + cl::desc("Perform FP math in SSE regs instead of the FP stack"), + cl::location(X86ScalarSSE), + cl::init(false)); -#if 0 // FIXME: This should eventually be handled with target triples and // subtarget support! 
cl::opt @@ -54,7 +58,6 @@ namespace { clEnumValN(SSE3, "sse3", " Enable SSE, SSE2, and SSE3 support"), clEnumValEnd), cl::location(X86Vector), cl::init(NoSSE)); -#endif // Register the target. RegisterTarget X("x86", " IA-32 (Pentium and above)"); @@ -91,6 +94,8 @@ X86TargetMachine::X86TargetMachine(const Module &M, IntrinsicLowering *IL) : TargetMachine("X86", IL, true, 4, 4, 4, 4, 4), FrameInfo(TargetFrameInfo::StackGrowsDown, 8, -4), JITInfo(*this) { + // Scalar SSE FP requires at least SSE2 + X86ScalarSSE &= X86Vector >= SSE2; }