Re-apply 40504, but with a fix for the segfault it caused in oggenc:

Make the alignedload and alignedstore patterns always require 16-byte
alignment. This way, when they are used in the "Fs" instructions, in which
a vector instruction is used for a scalar purpose, they still require the
full vector alignment. Also add a regression test for this.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@40555 91177308-0d34-0410-b5e6-96231b3b80d8
Dan Gohman 2007-07-27 17:16:43 +00:00
parent 9d3e79107d
commit d300622eba
5 changed files with 57 additions and 76 deletions
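
Background for the fix: SSE's aligned instructions (movaps, and the packed logical ops such as xorps when they take a memory operand) fault on any address that is not 16-byte aligned, while movups accepts any alignment. A minimal standalone sketch of that rule using the SSE intrinsics (illustration only, not part of this commit):

// Aligned vs. unaligned SSE loads; compile with -msse on x86.
#include <xmmintrin.h>
#include <cstdio>

int main() {
  alignas(16) float buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};

  // buf is 16-byte aligned, so an aligned load (movaps) is safe.
  __m128 a = _mm_load_ps(buf);

  // buf + 1 is only 4-byte aligned; only an unaligned load (movups) is
  // safe here. _mm_load_ps(buf + 1) would fault -- the same class of
  // crash the mis-folded scalar operations triggered in oggenc.
  __m128 u = _mm_loadu_ps(buf + 1);

  float out[4];
  _mm_storeu_ps(out, _mm_add_ps(a, u));
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
  return 0;
}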

View File

@@ -3367,14 +3367,10 @@ SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) {
     CV.push_back(C);
     CV.push_back(C);
   }
-  Constant *CS = ConstantStruct::get(CV);
-  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
-  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-  SmallVector<SDOperand, 3> Ops;
-  Ops.push_back(DAG.getEntryNode());
-  Ops.push_back(CPIdx);
-  Ops.push_back(DAG.getSrcValue(NULL));
-  SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  Constant *C = ConstantVector::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+  SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0,
+                               false, 16);
   return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask);
 }
@@ -3399,21 +3395,16 @@ SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) {
     CV.push_back(C);
     CV.push_back(C);
   }
-  Constant *CS = ConstantStruct::get(CV);
-  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  Constant *C = ConstantVector::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+  SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0,
+                               false, 16);
   if (MVT::isVector(VT)) {
-    SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0);
     return DAG.getNode(ISD::BIT_CONVERT, VT,
                        DAG.getNode(ISD::XOR, MVT::v2i64,
                        DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Op.getOperand(0)),
                        DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, Mask)));
   } else {
-    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-    SmallVector<SDOperand, 3> Ops;
-    Ops.push_back(DAG.getEntryNode());
-    Ops.push_back(CPIdx);
-    Ops.push_back(DAG.getSrcValue(NULL));
-    SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
     return DAG.getNode(X86ISD::FXOR, VT, Op.getOperand(0), Mask);
   }
 }
@@ -3442,14 +3433,10 @@ SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
     CV.push_back(ConstantFP::get(SrcTy, 0.0));
     CV.push_back(ConstantFP::get(SrcTy, 0.0));
   }
-  Constant *CS = ConstantStruct::get(CV);
-  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
-  SDVTList Tys = DAG.getVTList(SrcVT, MVT::Other);
-  SmallVector<SDOperand, 3> Ops;
-  Ops.push_back(DAG.getEntryNode());
-  Ops.push_back(CPIdx);
-  Ops.push_back(DAG.getSrcValue(NULL));
-  SDOperand Mask1 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  Constant *C = ConstantVector::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+  SDOperand Mask1 = DAG.getLoad(SrcVT, DAG.getEntryNode(), CPIdx, NULL, 0,
+                                false, 16);
   SDOperand SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1);
 
   // Shift sign bit right or left if the two operands have different types.
@@ -3474,14 +3461,10 @@ SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
     CV.push_back(ConstantFP::get(SrcTy, 0.0));
     CV.push_back(ConstantFP::get(SrcTy, 0.0));
   }
-  CS = ConstantStruct::get(CV);
-  CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
-  Tys = DAG.getVTList(VT, MVT::Other);
-  Ops.clear();
-  Ops.push_back(DAG.getEntryNode());
-  Ops.push_back(CPIdx);
-  Ops.push_back(DAG.getSrcValue(NULL));
-  SDOperand Mask2 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  C = ConstantVector::get(CV);
+  CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
+  SDOperand Mask2 = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0,
+                                false, 16);
   SDOperand Val = DAG.getNode(X86ISD::FAND, VT, Op0, Mask2);
 
   // Or the value with the sign bit.
@@ -4357,8 +4340,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
   case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
   case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
-  case X86ISD::LOAD_PACK: return "X86ISD::LOAD_PACK";
-  case X86ISD::LOAD_UA: return "X86ISD::LOAD_UA";
   case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
   case X86ISD::Wrapper: return "X86ISD::Wrapper";
   case X86ISD::S2VEC: return "X86ISD::S2VEC";
@@ -4756,19 +4737,14 @@ static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   }
   bool isAlign16 = isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget);
+  LoadSDNode *LD = cast<LoadSDNode>(Base);
   if (isAlign16) {
-    LoadSDNode *LD = cast<LoadSDNode>(Base);
     return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
-                       LD->getSrcValueOffset());
+                       LD->getSrcValueOffset(), LD->isVolatile());
   } else {
     // Just use movups, it's shorter.
-    SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
-    SmallVector<SDOperand, 3> Ops;
-    Ops.push_back(Base->getOperand(0));
-    Ops.push_back(Base->getOperand(1));
-    Ops.push_back(Base->getOperand(2));
-    return DAG.getNode(ISD::BIT_CONVERT, VT,
-                       DAG.getNode(X86ISD::LOAD_UA, Tys, &Ops[0], Ops.size()));
+    return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
+                       LD->getSrcValueOffset(), LD->isVolatile(),
+                       LD->getAlignment());
   }
 }

View File

@@ -143,14 +143,6 @@ namespace llvm {
       /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
      REP_MOVS,
 
-      /// LOAD_PACK Load a 128-bit packed float / double value. It has the same
-      /// operands as a normal load.
-      LOAD_PACK,
-
-      /// LOAD_UA Load an unaligned 128-bit value. It has the same operands as
-      /// a normal load.
-      LOAD_UA,
-
       /// GlobalBaseReg - On Darwin, this node represents the result of the popl
       /// at function entry, used for PIC code.
      GlobalBaseReg,

View File

@@ -21,8 +21,6 @@
 def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
                                             SDTCisFP<0>, SDTCisInt<2> ]>;
 
-def X86loadp : SDNode<"X86ISD::LOAD_PACK", SDTLoad, [SDNPHasChain]>;
-def X86loadu : SDNode<"X86ISD::LOAD_UA", SDTLoad, [SDNPHasChain]>;
 def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
 def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
 def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
@@ -82,33 +80,32 @@ def sdmem : Operand<v2f64> {
 //===----------------------------------------------------------------------===//
 // SSE pattern fragments
 //===----------------------------------------------------------------------===//
 
-def X86loadpf32 : PatFrag<(ops node:$ptr), (f32 (X86loadp node:$ptr))>;
-def X86loadpf64 : PatFrag<(ops node:$ptr), (f64 (X86loadp node:$ptr))>;
-
 def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
 def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
 def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
 def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
 
-// Like 'store', but always requires natural alignment.
+// Like 'store', but always requires vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (st node:$val, node:$ptr), [{
   if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
     return !ST->isTruncatingStore() &&
            ST->getAddressingMode() == ISD::UNINDEXED &&
-           ST->getAlignment() * 8 >= MVT::getSizeInBits(ST->getStoredVT());
+           ST->getAlignment() >= 16;
   return false;
 }]>;
 
-// Like 'load', but always requires natural alignment.
+// Like 'load', but always requires vector alignment.
 def alignedload : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
     return LD->getExtensionType() == ISD::NON_EXTLOAD &&
            LD->getAddressingMode() == ISD::UNINDEXED &&
-           LD->getAlignment() * 8 >= MVT::getSizeInBits(LD->getLoadedVT());
+           LD->getAlignment() >= 16;
   return false;
 }]>;
 
+def alignedloadfsf32 : PatFrag<(ops node:$ptr), (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr), (f64 (alignedload node:$ptr))>;
 def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>;
 def alignedloadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (alignedload node:$ptr))>;
 def alignedloadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (alignedload node:$ptr))>;
@@ -123,10 +120,12 @@ def memop : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
     return LD->getExtensionType() == ISD::NON_EXTLOAD &&
            LD->getAddressingMode() == ISD::UNINDEXED &&
-           LD->getAlignment() * 8 >= MVT::getSizeInBits(LD->getLoadedVT());
+           LD->getAlignment() >= 16;
   return false;
 }]>;
 
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
 def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
 def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
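
The predicate change above is the core of the fix. The old check compared the alignment against the loaded type's natural alignment, so a merely 4-byte-aligned f32 load qualified and could be folded into a 16-byte movaps/andps/xorps. The new check demands a full 16 bytes regardless of the loaded type. A standalone restatement of the two checks (a sketch; the function names are ours, not LLVM's):

#include <cassert>

// Old predicate: alignment covers the loaded type. For f32 that is only
// 4 bytes, so under-aligned scalar loads matched the "aligned" patterns.
bool naturalAlignment(unsigned alignBytes, unsigned typeSizeBits) {
  return alignBytes * 8 >= typeSizeBits;
}

// New predicate: alignment covers a whole 16-byte vector, which is what
// the SSE instructions selected by these patterns actually require.
bool vectorAlignment(unsigned alignBytes) { return alignBytes >= 16; }

int main() {
  assert(naturalAlignment(4, 32)); // old check accepted a 4-byte-aligned f32
  assert(!vectorAlignment(4));     // new check rejects it: no unsafe fold
  assert(vectorAlignment(16));     // truly 16-byte-aligned loads still fold
  return 0;
}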
@@ -410,7 +409,7 @@ def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
 // disregarded.
 def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                      "movaps {$src, $dst|$dst, $src}",
-                     [(set FR32:$dst, (X86loadpf32 addr:$src))]>;
+                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
 
 // Alias bitwise logical operations using SSE logical ops on packed FP values.
 let isTwoAddress = 1 in {
@@ -429,15 +428,15 @@ let isCommutable = 1 in {
 def FsANDPSrm : PSI<0x54, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
                 "andps {$src2, $dst|$dst, $src2}",
                 [(set FR32:$dst, (X86fand FR32:$src1,
-                                  (X86loadpf32 addr:$src2)))]>;
+                                  (memopfsf32 addr:$src2)))]>;
 def FsORPSrm : PSI<0x56, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
                "orps {$src2, $dst|$dst, $src2}",
                [(set FR32:$dst, (X86for FR32:$src1,
-                                 (X86loadpf32 addr:$src2)))]>;
+                                 (memopfsf32 addr:$src2)))]>;
 def FsXORPSrm : PSI<0x57, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
                "xorps {$src2, $dst|$dst, $src2}",
                [(set FR32:$dst, (X86fxor FR32:$src1,
-                                 (X86loadpf32 addr:$src2)))]>;
+                                 (memopfsf32 addr:$src2)))]>;
 
 def FsANDNPSrr : PSI<0x55, MRMSrcReg,
                      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
@@ -1083,7 +1082,7 @@ def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
 // disregarded.
 def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                      "movapd {$src, $dst|$dst, $src}",
-                     [(set FR64:$dst, (X86loadpf64 addr:$src))]>;
+                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
 
 // Alias bitwise logical operations using SSE logical ops on packed FP values.
 let isTwoAddress = 1 in {
@@ -1102,15 +1101,15 @@ let isCommutable = 1 in {
 def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
                 "andpd {$src2, $dst|$dst, $src2}",
                 [(set FR64:$dst, (X86fand FR64:$src1,
-                                  (X86loadpf64 addr:$src2)))]>;
+                                  (memopfsf64 addr:$src2)))]>;
 def FsORPDrm : PDI<0x56, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
                "orpd {$src2, $dst|$dst, $src2}",
                [(set FR64:$dst, (X86for FR64:$src1,
-                                 (X86loadpf64 addr:$src2)))]>;
+                                 (memopfsf64 addr:$src2)))]>;
 def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
                "xorpd {$src2, $dst|$dst, $src2}",
                [(set FR64:$dst, (X86fxor FR64:$src1,
-                                 (X86loadpf64 addr:$src2)))]>;
+                                 (memopfsf64 addr:$src2)))]>;
 
 def FsANDNPDrr : PDI<0x55, MRMSrcReg,
                      (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
@@ -2630,11 +2629,11 @@ def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
                   (load addr:$src2))),
           (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
 
-// Unaligned load
-def : Pat<(v4f32 (X86loadu addr:$src)), (MOVUPSrm addr:$src)>,
-          Requires<[HasSSE1]>;
-
 // Use movaps / movups for SSE integer load / store (one byte shorter).
+def : Pat<(alignedloadv4i32 addr:$src),
+          (MOVAPSrm addr:$src)>, Requires<[HasSSE1]>;
 def : Pat<(loadv4i32 addr:$src),
           (MOVUPSrm addr:$src)>, Requires<[HasSSE1]>;
+def : Pat<(alignedloadv2i64 addr:$src),
+          (MOVAPSrm addr:$src)>, Requires<[HasSSE2]>;
 def : Pat<(loadv2i64 addr:$src),

View File

@@ -0,0 +1,14 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse -enable-unsafe-fp-math | \
+; RUN:   grep -v sp | grep xorps | wc -l | grep 2
+
+; Don't fold the incoming stack arguments into the xorps instructions used
+; to do floating-point negations, because the arguments aren't vectors
+; and aren't vector-aligned.
+
+define void @foo(float* %p, float* %q, float %s, float %y) {
+  %ss = sub float -0.0, %s
+  %yy = sub float -0.0, %y
+  store float %ss, float* %p
+  store float %yy, float* %q
+  ret void
+}
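
For reference, the lowering this test constrains: with -enable-unsafe-fp-math, each scalar fneg becomes an xorps against a sign-bit mask loaded from a 16-byte-aligned constant pool entry, and the grep -v sp filter verifies that neither xorps reads its other operand straight off the (only 4-byte-aligned) stack. The underlying bit trick, as a plain C++ sketch (illustrative only, not actual compiler output):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // The mask LowerFNEG materializes: every f32 lane is -0.0, i.e. only
  // the sign bit set. It lives in 16-byte-aligned memory so that xorps
  // may use it as a direct memory operand.
  alignas(16) const uint32_t mask[4] = {0x80000000u, 0x80000000u,
                                        0x80000000u, 0x80000000u};
  float s = 1.5f;
  uint32_t bits;
  memcpy(&bits, &s, sizeof bits);
  bits ^= mask[0];               // flip the sign bit, as xorps would
  float neg;
  memcpy(&neg, &bits, sizeof neg);
  printf("%f -> %f\n", s, neg);  // prints: 1.500000 -> -1.500000
  return 0;
}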

View File

@@ -1,6 +1,6 @@
 ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -mattr=+sse2 -o %t -f
 ; RUN: grep shufp %t | wc -l | grep 1
-; RUN: grep movups %t | wc -l | grep 1
+; RUN: grep movupd %t | wc -l | grep 1
 ; RUN: grep pshufhw %t | wc -l | grep 1
 
 void %test_v4sf(<4 x float>* %P, float %X, float %Y) {