Codegen allonesvector better while using AVX: vpcmpeqd + vinsertf128

This also fixes PR10452 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136004 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-06 09:44:39 +00:00 · 2011-07-25 23:05:32 +00:00 · 2011-07-25 23:05:32 +00:00 · 863bd9d5cf
commit 863bd9d5cf
parent 51e92e8e41
5 changed files with 70 additions and 13 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -3831,21 +3831,25 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
 }

 /// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32> bitcasted to
-/// their original type, ensuring they get CSE'd.
+/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
+/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
+/// original type, ensuring they get CSE'd.
 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");
  assert((VT.is128BitVector() || VT.is256BitVector())
         && "Expected a 128-bit or 256-bit vector type");

  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                            Cst, Cst, Cst, Cst);

-  SDValue Vec;
  if (VT.is256BitVector()) {
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
-  } else
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+                              Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
+    Vec = Insert128BitVector(InsV, Vec,
+                  DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+  }
+
  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
 }

@ -12023,6 +12027,35 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
  return SDValue();
 }

+/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  // Match direct AllOnes for 128 and 256-bit vectors
+  if (ISD::isBuildVectorAllOnes(N))
+    return true;
+
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  // Sometimes the operand may come from a insert_subvector building a 256-bit
+  // allones vector
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+
+  if (VT.getSizeInBits() == 256 &&
+      N->getOpcode() == ISD::INSERT_SUBVECTOR &&
+      V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+      ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+      ISD::isBuildVectorAllOnes(V2.getNode()))
+    return true;
+
+  return false;
+}
+
 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
@ -12047,12 +12080,14 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,

  // Check LHS for vnot
  if (N0.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  // Check RHS for vnot
  if (N1.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@ -2450,6 +2450,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
    case X86::AVX_SET0PS:
    case X86::AVX_SET0PD:
    case X86::AVX_SET0PI:
+    case X86::AVX_SETALLONES:
      Alignment = 16;
      break;
    case X86::FsFLD0SD:
@ -2494,6 +2495,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
  case X86::AVX_SET0PI:
  case X86::AVX_SET0PSY:
  case X86::AVX_SET0PDY:
+  case X86::AVX_SETALLONES:
  case X86::FsFLD0SD:
  case X86::FsFLD0SS:
  case X86::VFsFLD0SD:
@ -2531,9 +2533,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
      Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
    else
      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
-    const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
-                    Constant::getAllOnesValue(Ty) :
-                    Constant::getNullValue(Ty);
+
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES);
+    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+                                    Constant::getNullValue(Ty);
    unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);

    // Create operands to load from the constant pool entry.
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@ -3143,11 +3143,17 @@ def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
 // Alias instructions that map zero vector to pxor / xorp* for sse.
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-ones value if folding it would be beneficial.
+// FIXME: Change encoding to pseudo! This is blocked right now by the x86
+// JIT implementation, it does not expand the instructions below like
+// X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
-  // FIXME: Change encoding to pseudo.
  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
+  def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+                         [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;

 //===---------------------------------------------------------------------===//
 // SSE3 - Conversion Instructions
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@ -381,6 +381,7 @@ ReSimplify:
  case X86::AVX_SET0PD:    LowerUnaryToTwoAddr(OutMI, X86::VXORPDrr); break;
  case X86::AVX_SET0PDY:   LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
  case X86::AVX_SET0PI:    LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
+  case X86::AVX_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;

  case X86::MOV16r0:
    LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
--- a/test/CodeGen/X86/avx-256.ll
+++ b/test/CodeGen/X86/avx-256.ll
@ -12,3 +12,15 @@ entry:
  store <4 x double> zeroinitializer, <4 x double>* @y, align 32
  ret void
 }
+
+; CHECK: vpcmpeqd
+; CHECK: vinsertf128 $1
+define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
+allocas:
+  %ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
+  store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
+0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
+0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <8 x
+float>* %ptr2vec615, align 32
+  ret void
+}