From d300622ebacde5bffb5b5e58142323e505df9dbe Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 27 Jul 2007 17:16:43 +0000 Subject: [PATCH] Re-apply 40504, but with a fix for the segfault it caused in oggenc: Make the alignedload and alignedstore patterns always require 16-byte alignment. This way when they are used in the "Fs" instructions, in which a vector instruction is used for a scalar purpose, they can still require the full vector alignment. And add a regression test for this. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@40555 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 66 +++++++++-------------------- lib/Target/X86/X86ISelLowering.h | 8 ---- lib/Target/X86/X86InstrSSE.td | 43 +++++++++---------- test/CodeGen/X86/fsxor-alignment.ll | 14 ++++++ test/CodeGen/X86/vec_shuffle.ll | 2 +- 5 files changed, 57 insertions(+), 76 deletions(-) create mode 100644 test/CodeGen/X86/fsxor-alignment.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 23f9e9500c2..94505c45c0f 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -3367,14 +3367,10 @@ SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) { CV.push_back(C); CV.push_back(C); } - Constant *CS = ConstantStruct::get(CV); - SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SmallVector Ops; - Ops.push_back(DAG.getEntryNode()); - Ops.push_back(CPIdx); - Ops.push_back(DAG.getSrcValue(NULL)); - SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size()); + Constant *C = ConstantVector::get(CV); + SDOperand CPIdx = DAG.getConstantPool(C, getPointerTy(), 4); + SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0, + false, 16); return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask); } @@ -3399,21 +3395,16 @@ SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) { CV.push_back(C); CV.push_back(C); } - Constant *CS = ConstantStruct::get(CV); - SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); + Constant *C = ConstantVector::get(CV); + SDOperand CPIdx = DAG.getConstantPool(C, getPointerTy(), 4); + SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0, + false, 16); if (MVT::isVector(VT)) { - SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0); return DAG.getNode(ISD::BIT_CONVERT, VT, DAG.getNode(ISD::XOR, MVT::v2i64, DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Op.getOperand(0)), DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Mask))); } else { - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SmallVector Ops; - Ops.push_back(DAG.getEntryNode()); - Ops.push_back(CPIdx); - Ops.push_back(DAG.getSrcValue(NULL)); - SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size()); return DAG.getNode(X86ISD::FXOR, VT, Op.getOperand(0), Mask); } } @@ -3442,14 +3433,10 @@ SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) { CV.push_back(ConstantFP::get(SrcTy, 0.0)); CV.push_back(ConstantFP::get(SrcTy, 0.0)); } - Constant *CS = ConstantStruct::get(CV); - SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); - SDVTList Tys = DAG.getVTList(SrcVT, MVT::Other); - SmallVector Ops; - Ops.push_back(DAG.getEntryNode()); - Ops.push_back(CPIdx); - Ops.push_back(DAG.getSrcValue(NULL)); - SDOperand Mask1 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size()); + Constant *C = ConstantVector::get(CV); + SDOperand CPIdx = DAG.getConstantPool(C, getPointerTy(), 4); + SDOperand Mask1 = DAG.getLoad(SrcVT, DAG.getEntryNode(), CPIdx, NULL, 0, + false, 16); SDOperand SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1); // Shift sign bit right or left if the two operands have different types. @@ -3474,14 +3461,10 @@ SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) { CV.push_back(ConstantFP::get(SrcTy, 0.0)); CV.push_back(ConstantFP::get(SrcTy, 0.0)); } - CS = ConstantStruct::get(CV); - CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); - Tys = DAG.getVTList(VT, MVT::Other); - Ops.clear(); - Ops.push_back(DAG.getEntryNode()); - Ops.push_back(CPIdx); - Ops.push_back(DAG.getSrcValue(NULL)); - SDOperand Mask2 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size()); + C = ConstantVector::get(CV); + CPIdx = DAG.getConstantPool(C, getPointerTy(), 4); + SDOperand Mask2 = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0, + false, 16); SDOperand Val = DAG.getNode(X86ISD::FAND, VT, Op0, Mask2); // Or the value with the sign bit. @@ -4357,8 +4340,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; - case X86ISD::LOAD_PACK: return "X86ISD::LOAD_PACK"; - case X86ISD::LOAD_UA: return "X86ISD::LOAD_UA"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::S2VEC: return "X86ISD::S2VEC"; @@ -4756,19 +4737,14 @@ static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, } bool isAlign16 = isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget); + LoadSDNode *LD = cast(Base); if (isAlign16) { - LoadSDNode *LD = cast(Base); return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), - LD->getSrcValueOffset()); + LD->getSrcValueOffset(), LD->isVolatile()); } else { - // Just use movups, it's shorter. - SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other); - SmallVector Ops; - Ops.push_back(Base->getOperand(0)); - Ops.push_back(Base->getOperand(1)); - Ops.push_back(Base->getOperand(2)); - return DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(X86ISD::LOAD_UA, Tys, &Ops[0], Ops.size())); + return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), + LD->getSrcValueOffset(), LD->isVolatile(), + LD->getAlignment()); } } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 07a96d35691..521916e035a 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -143,14 +143,6 @@ namespace llvm { /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx. REP_MOVS, - /// LOAD_PACK Load a 128-bit packed float / double value. It has the same - /// operands as a normal load. - LOAD_PACK, - - /// LOAD_UA Load an unaligned 128-bit value. It has the same operands as - /// a normal load. - LOAD_UA, - /// GlobalBaseReg - On Darwin, this node represents the result of the popl /// at function entry, used for PIC code. GlobalBaseReg, diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 9e0f75de997..327e44ccbd5 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -21,8 +21,6 @@ def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2> ]>; -def X86loadp : SDNode<"X86ISD::LOAD_PACK", SDTLoad, [SDNPHasChain]>; -def X86loadu : SDNode<"X86ISD::LOAD_UA", SDTLoad, [SDNPHasChain]>; def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, @@ -82,33 +80,32 @@ def sdmem : Operand { // SSE pattern fragments //===----------------------------------------------------------------------===// -def X86loadpf32 : PatFrag<(ops node:$ptr), (f32 (X86loadp node:$ptr))>; -def X86loadpf64 : PatFrag<(ops node:$ptr), (f64 (X86loadp node:$ptr))>; - def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; -// Like 'store', but always requires natural alignment. +// Like 'store', but always requires vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), (st node:$val, node:$ptr), [{ if (StoreSDNode *ST = dyn_cast(N)) return !ST->isTruncatingStore() && ST->getAddressingMode() == ISD::UNINDEXED && - ST->getAlignment() * 8 >= MVT::getSizeInBits(ST->getStoredVT()); + ST->getAlignment() >= 16; return false; }]>; -// Like 'load', but always requires natural alignment. +// Like 'load', but always requires vector alignment. def alignedload : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ if (LoadSDNode *LD = dyn_cast(N)) return LD->getExtensionType() == ISD::NON_EXTLOAD && LD->getAddressingMode() == ISD::UNINDEXED && - LD->getAlignment() * 8 >= MVT::getSizeInBits(LD->getLoadedVT()); + LD->getAlignment() >= 16; return false; }]>; +def alignedloadfsf32 : PatFrag<(ops node:$ptr), (f32 (alignedload node:$ptr))>; +def alignedloadfsf64 : PatFrag<(ops node:$ptr), (f64 (alignedload node:$ptr))>; def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (alignedload node:$ptr))>; def alignedloadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (alignedload node:$ptr))>; @@ -123,10 +120,12 @@ def memop : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ if (LoadSDNode *LD = dyn_cast(N)) return LD->getExtensionType() == ISD::NON_EXTLOAD && LD->getAddressingMode() == ISD::UNINDEXED && - LD->getAlignment() * 8 >= MVT::getSizeInBits(LD->getLoadedVT()); + LD->getAlignment() >= 16; return false; }]>; +def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; +def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>; @@ -410,7 +409,7 @@ def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), // disregarded. def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), "movaps {$src, $dst|$dst, $src}", - [(set FR32:$dst, (X86loadpf32 addr:$src))]>; + [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>; // Alias bitwise logical operations using SSE logical ops on packed FP values. let isTwoAddress = 1 in { @@ -429,15 +428,15 @@ let isCommutable = 1 in { def FsANDPSrm : PSI<0x54, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2), "andps {$src2, $dst|$dst, $src2}", [(set FR32:$dst, (X86fand FR32:$src1, - (X86loadpf32 addr:$src2)))]>; + (memopfsf32 addr:$src2)))]>; def FsORPSrm : PSI<0x56, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2), "orps {$src2, $dst|$dst, $src2}", [(set FR32:$dst, (X86for FR32:$src1, - (X86loadpf32 addr:$src2)))]>; + (memopfsf32 addr:$src2)))]>; def FsXORPSrm : PSI<0x57, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2), "xorps {$src2, $dst|$dst, $src2}", [(set FR32:$dst, (X86fxor FR32:$src1, - (X86loadpf32 addr:$src2)))]>; + (memopfsf32 addr:$src2)))]>; def FsANDNPSrr : PSI<0x55, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), @@ -1083,7 +1082,7 @@ def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), // disregarded. def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), "movapd {$src, $dst|$dst, $src}", - [(set FR64:$dst, (X86loadpf64 addr:$src))]>; + [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>; // Alias bitwise logical operations using SSE logical ops on packed FP values. let isTwoAddress = 1 in { @@ -1102,15 +1101,15 @@ let isCommutable = 1 in { def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2), "andpd {$src2, $dst|$dst, $src2}", [(set FR64:$dst, (X86fand FR64:$src1, - (X86loadpf64 addr:$src2)))]>; + (memopfsf64 addr:$src2)))]>; def FsORPDrm : PDI<0x56, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2), "orpd {$src2, $dst|$dst, $src2}", [(set FR64:$dst, (X86for FR64:$src1, - (X86loadpf64 addr:$src2)))]>; + (memopfsf64 addr:$src2)))]>; def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2), "xorpd {$src2, $dst|$dst, $src2}", [(set FR64:$dst, (X86fxor FR64:$src1, - (X86loadpf64 addr:$src2)))]>; + (memopfsf64 addr:$src2)))]>; def FsANDNPDrr : PDI<0x55, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), @@ -2630,11 +2629,11 @@ def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), (load addr:$src2))), (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; -// Unaligned load -def : Pat<(v4f32 (X86loadu addr:$src)), (MOVUPSrm addr:$src)>, - Requires<[HasSSE1]>; - // Use movaps / movups for SSE integer load / store (one byte shorter). +def : Pat<(alignedloadv4i32 addr:$src), + (MOVAPSrm addr:$src)>, Requires<[HasSSE1]>; +def : Pat<(loadv4i32 addr:$src), + (MOVUPSrm addr:$src)>, Requires<[HasSSE1]>; def : Pat<(alignedloadv2i64 addr:$src), (MOVAPSrm addr:$src)>, Requires<[HasSSE2]>; def : Pat<(loadv2i64 addr:$src), diff --git a/test/CodeGen/X86/fsxor-alignment.ll b/test/CodeGen/X86/fsxor-alignment.ll new file mode 100644 index 00000000000..bb55e4466f8 --- /dev/null +++ b/test/CodeGen/X86/fsxor-alignment.ll @@ -0,0 +1,14 @@ +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse -enable-unsafe-fp-math | \ +; RUN: grep -v sp | grep xorps | wc -l | grep 2 + +; Don't fold the incoming stack arguments into the xorps instructions used +; to do floating-point negations, because the arguments aren't vectors +; and aren't vector-aligned. + +define void @foo(float* %p, float* %q, float %s, float %y) { + %ss = sub float -0.0, %s + %yy = sub float -0.0, %y + store float %ss, float* %p + store float %yy, float* %q + ret void +} diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll index d06efa5ff26..16ce3dac549 100644 --- a/test/CodeGen/X86/vec_shuffle.ll +++ b/test/CodeGen/X86/vec_shuffle.ll @@ -1,6 +1,6 @@ ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -mattr=+sse2 -o %t -f ; RUN: grep shufp %t | wc -l | grep 1 -; RUN: grep movups %t | wc -l | grep 1 +; RUN: grep movupd %t | wc -l | grep 1 ; RUN: grep pshufhw %t | wc -l | grep 1 void %test_v4sf(<4 x float>* %P, float %X, float %Y) {