Codegen allonesvector better while using AVX: vpcmpeqd + vinsertf128

This also fixes PR10452

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136004 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Bruno Cardoso Lopes 2011-07-25 23:05:32 +00:00
parent 51e92e8e41
commit 863bd9d5cf
5 changed files with 70 additions and 13 deletions

View File

@ -3831,21 +3831,25 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
}
/// getOnesVector - Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
/// original type, ensuring they get CSE'd.
///
/// \param VT  The requested result type; must be a 128-bit or 256-bit vector.
/// \param DAG The SelectionDAG in which the nodes are created.
/// \param dl  Debug location attached to the created nodes.
/// \returns   A BITCAST to \p VT of the all-ones vector.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");
  assert((VT.is128BitVector() || VT.is256BitVector())
         && "Expected a 128-bit or 256-bit vector type");

  // The 128-bit all-ones <4 x i32> is the single building block for both
  // supported widths.
  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                            Cst, Cst, Cst, Cst);

  if (VT.is256BitVector()) {
    // Place the same 128-bit value at element index 0 and at element index 4
    // of an initially undefined <8 x i32>.
    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
                                      Vec, DAG.getConstant(0, MVT::i32),
                                      DAG, dl);
    Vec = Insert128BitVector(InsV, Vec,
                             DAG.getConstant(4 /* NumElems/2 */, MVT::i32),
                             DAG, dl);
  }

  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
@ -12023,6 +12027,35 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
/// so it can be folded inside ANDNP.
///
/// \param N The node used as the second operand of the XOR.
/// \returns true if \p N is a build_vector of all ones, or (for 256-bit
///          types, possibly behind one bitcast) an insert_subvector chain
///          that places all-ones build_vectors into an undef vector.
static bool CanFoldXORWithAllOnes(const SDNode *N) {
  // Capture the type before looking through any bitcast: the width check
  // below is made against the type of the node the XOR actually uses.
  EVT VT = N->getValueType(0);

  // Match direct AllOnes for 128 and 256-bit vectors
  if (ISD::isBuildVectorAllOnes(N))
    return true;

  // Look through a bit convert.
  if (N->getOpcode() == ISD::BITCAST)
    N = N->getOperand(0).getNode();

  // Sometimes the operand may come from an insert_subvector building a
  // 256-bit allones vector. Check the opcode *before* reading operands 0 and
  // 1, otherwise a node with fewer than two operands is indexed out of range.
  if (VT.getSizeInBits() != 256 || N->getOpcode() != ISD::INSERT_SUBVECTOR)
    return false;

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

  return V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
         ISD::isBuildVectorAllOnes(V2.getNode());
}
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@ -12047,12 +12080,14 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
//ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
// Check RHS for vnot
if (N1.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
//ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
return SDValue();

View File

@ -2450,6 +2450,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
case X86::AVX_SET0PS:
case X86::AVX_SET0PD:
case X86::AVX_SET0PI:
case X86::AVX_SETALLONES:
Alignment = 16;
break;
case X86::FsFLD0SD:
@ -2494,6 +2495,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
case X86::AVX_SET0PI:
case X86::AVX_SET0PSY:
case X86::AVX_SET0PDY:
case X86::AVX_SETALLONES:
case X86::FsFLD0SD:
case X86::FsFLD0SS:
case X86::VFsFLD0SD:
@ -2531,9 +2533,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
// Create operands to load from the constant pool entry.

View File

@ -3143,11 +3143,17 @@ def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
// Alias instructions that map an all-ones vector to pcmpeqd (vpcmpeqd for
// AVX). We set canFoldAsLoad because this can be converted to a
// constant-pool load of an all-ones value if folding it would be beneficial.
// FIXME: Change encoding to pseudo! This is blocked right now by the x86
// JIT implementation, it does not expand the instructions below like
// X86MCInstLower does.
// V_SETALLONES - Produces an all-ones value in a VR128 register; it is
// selected for the (v4i32 immAllOnesV) pattern, takes no input operands and
// has an empty asm string. The surrounding 'let' marks it rematerializable,
// as cheap as a move, foldable as a load and codegen-only, and places it in
// the SSEPackedInt execution domain.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
// FIXME: Change encoding to pseudo.
def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
// AVX_SETALLONES - AVX variant of the all-ones alias: same opcode (0x76),
// MRMInitReg form and (v4i32 immAllOnesV) pattern into VR128, but carrying
// the VEX_4V encoding modifier and restricted to subtargets satisfying
// HasAVX. It has the same rematerialization/folding flags and SSEPackedInt
// execution domain set by the 'let' below.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
//===---------------------------------------------------------------------===//
// SSE3 - Conversion Instructions

View File

@ -381,6 +381,7 @@ ReSimplify:
case X86::AVX_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::VXORPDrr); break;
case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
case X86::AVX_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
case X86::MOV16r0:
LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0

View File

@ -12,3 +12,15 @@ entry:
store <4 x double> zeroinitializer, <4 x double>* @y, align 32
ret void
}
; CHECK: vpcmpeqd
; CHECK: vinsertf128 $1
; @ones - Stores a constant <8 x float> through %RET (bitcast to an
; <8 x float> pointer) with 32-byte alignment. All eight lanes use the same
; hex constant 0xFFFFFFFFE0000000 (presumably the all-bits-set float lane
; pattern - confirm). The checks above expect a vpcmpeqd followed by a
; vinsertf128 $1 in the generated code. %aFOO is unused.
define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
allocas:
%ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <8 x
float>* %ptr2vec615, align 32
ret void
}