diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a214c06c3b5..7df931fdd4e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3251,7 +3251,7 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
 /// specifies a shuffle of elements that is suitable for input to 256-bit
 /// VSHUFPSY.
 static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                          bool HasAVX) {
+                          bool HasAVX, bool Commuted = false) {
   int NumElems = VT.getVectorNumElements();
   if (!HasAVX || VT.getSizeInBits() != 256)
     return false;
@@ -3279,114 +3279,27 @@ static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
   //
   // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
   //
-  int QuarterSize = NumElems/4;
-  int HalfSize = QuarterSize*2;
-  for (int i = 0; i < QuarterSize; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
-      return false;
-  for (int i = QuarterSize; i < QuarterSize*2; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
-      return false;
-
-  // For VSHUFPSY, the mask of the second half must be the same as the first
-  // but with the appropriate offsets. This works in the same way as
-  // VPERMILPS works with masks.
-  for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
-    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
-      return false;
-    if (NumElems == 4)
-      continue;
-    // VSHUFPSY handling
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
-  }
-  for (int i = QuarterSize*3; i < NumElems; ++i) {
-    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
-      return false;
-    int FstHalfIdx = i-HalfSize;
-    if (NumElems == 4)
-      continue;
-    // VSHUFPSY handling
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
-  }
-
-  return true;
-}
-
-/// isCommutedVSHUFP() - Returns true if the shuffle mask is exactly
-/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
-/// half elements to come from vector 1 (which would equal the dest.) and
-/// the upper half to come from vector 2.
-static bool isCommutedVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                                  bool HasAVX) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (!HasAVX || VT.getSizeInBits() != 256)
-    return false;
-
-  if (NumElems != 4 && NumElems != 8)
-    return false;
-
-  // VSHUFPSY divides the resulting vector into 4 chunks.
-  // The sources are also splitted into 4 chunks, and each destination
-  // chunk must come from a different source chunk.
-  //
-  // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
-  // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9
-  //
-  // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
-  //        Y3..Y0, Y3..Y0, X3..X0, X3..X0
-  //
-  // VSHUFPDY divides the resulting vector into 4 chunks.
-  // The sources are also splitted into 4 chunks, and each destination
-  // chunk must come from a different source chunk.
-  //
-  // SRC1 => X3 X2 X1 X0
-  // SRC2 => Y3 Y2 Y1 Y0
-  //
-  // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
-  //
-  int QuarterSize = NumElems/4;
-  int HalfSize = QuarterSize*2;
-  for (int i = 0; i < QuarterSize; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
-      return false;
-  for (int i = QuarterSize; i < QuarterSize*2; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
-      return false;
-
-  // For VSHUFPSY, the mask of the second half must be the same as the first
-  // but with the appropriate offsets. This works in the same way as
-  // VPERMILPS works with masks.
-  for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
-    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
-      return false;
-    if (NumElems == 4)
-      continue;
-    // VSHUFPSY handling
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
-  }
-  for (int i = QuarterSize*3; i < NumElems; ++i) {
-    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
-      return false;
-    if (NumElems == 4)
-      continue;
-    // VSHUFPSY handling
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
+  unsigned QuarterSize = NumElems/4;
+  unsigned HalfSize = QuarterSize*2;
+  for (unsigned l = 0; l != 2; ++l) {
+    unsigned LaneStart = l*HalfSize;
+    for (unsigned s = 0; s != 2; ++s) {
+      unsigned QuarterStart = s*QuarterSize;
+      unsigned Src = (Commuted) ? (1-s) : s;
+      unsigned SrcStart = Src*NumElems + LaneStart;
+      for (unsigned i = 0; i != QuarterSize; ++i) {
+        int Idx = Mask[i+QuarterStart+LaneStart];
+        if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize))
+          return false;
+        // For VSHUFPSY, the mask of the second half must be the same as the first
+        // but with the appropriate offsets. This works in the same way as
+        // VPERMILPS works with masks.
+        if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0)
+          continue;
+        if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize))
+          return false;
+      }
+    }
   }
   return true;
 }
@@ -3436,9 +3349,11 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
 
 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 128-bit
-/// SHUFPS and SHUFPD.
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
+/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
+/// reverse of what x86 shuffles want.
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                        bool Commuted = false) {
+  unsigned NumElems = VT.getVectorNumElements();
 
   if (VT.getSizeInBits() != 128)
     return false;
@@ -3446,12 +3361,14 @@ static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT,
   if (NumElems != 2 && NumElems != 4)
     return false;
 
-  int Half = NumElems / 2;
-  for (int i = 0; i < Half; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, NumElems))
+  unsigned Half = NumElems / 2;
+  unsigned SrcStart = Commuted ? NumElems : 0;
+  for (unsigned i = 0; i != Half; ++i)
+    if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
       return false;
-  for (int i = Half; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+  SrcStart = Commuted ? 0 : NumElems;
+  for (unsigned i = Half; i != NumElems; ++i)
+    if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
       return false;
 
   return true;
@@ -3463,26 +3380,6 @@ bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
   return ::isSHUFPMask(M, N->getValueType(0));
 }
 
-/// isCommutedSHUFPMask - Returns true if the shuffle mask is exactly
-/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
-/// half elements to come from vector 1 (which would equal the dest.) and
-/// the upper half to come from vector 2.
-static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-
-  int Half = NumElems / 2;
-  for (int i = 0; i < Half; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
-      return false;
-  for (int i = Half; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, NumElems))
-      return false;
-  return true;
-}
-
 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
 bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
@@ -6780,8 +6677,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isCommutedSHUFPMask(M, VT) ||
-                     isCommutedVSHUFPYMask(M, VT, HasAVX)))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) ||
+                     isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true)))
     return CommuteVectorShuffle(SVOp, DAG);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
@@ -11272,7 +11169,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
     return (isMOVLMask(Mask, VT) ||
             isCommutedMOVLMask(Mask, VT, true) ||
             isSHUFPMask(Mask, VT) ||
-            isCommutedSHUFPMask(Mask, VT));
+            isSHUFPMask(Mask, VT, /* Commuted */ true));
   }
   return false;
 }
diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll
index 3550a908231..caa21e5bacf 100644
--- a/test/CodeGen/X86/avx-vperm2f128.ll
+++ b/test/CodeGen/X86/avx-vperm2f128.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
+; CHECK: _A
 ; CHECK: vperm2f128 $1
 define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
 entry:
@@ -7,6 +8,7 @@ entry:
   ret <8 x float> %shuffle
 }
 
+; CHECK: _B
 ; CHECK: vperm2f128 $48
 define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
 entry:
@@ -14,6 +16,7 @@ entry:
   ret <8 x float> %shuffle
 }
 
+; CHECK: _C
 ; CHECK: vperm2f128 $0
 define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
 entry:
@@ -21,6 +24,7 @@ entry:
   ret <8 x float> %shuffle
 }
 
+; CHECK: _D
 ; CHECK: vperm2f128 $17
 define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
 entry:
@@ -28,6 +32,7 @@ entry:
   ret <8 x float> %shuffle
 }
 
+; CHECK: _E
 ; CHECK: vperm2f128 $17
 define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
 entry:
@@ -35,7 +40,8 @@ entry:
   ret <32 x i8> %shuffle
 }
 
-; CHECK: vperm2f128 $33
+; CHECK: _E2
+; CHECK: vperm2f128 $3
 define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
 entry:
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@@ -44,6 +50,7 @@ entry:
 
 ;;;; Cases with undef indicies mixed in the mask
 
+; CHECK: _F
 ; CHECK: vperm2f128 $33
 define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll
index 6fcd2d7c1b6..1937db5d7c1 100644
--- a/test/CodeGen/X86/avx2-vperm2i128.ll
+++ b/test/CodeGen/X86/avx2-vperm2i128.ll
@@ -9,7 +9,7 @@ entry:
   ret <32 x i8> %shuffle
 }
 
-; CHECK: vperm2i128 $33
+; CHECK: vperm2i128 $3
 define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
 entry:
   ; add forces execution domain
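A note on the rewritten isVSHUFPYMask: the four hand-unrolled quarter loops become one lane/quarter loop nest. Each 128-bit lane of the destination splits into two quarters; quarter 0 of a lane reads source 1 and quarter 1 reads source 2 (swapped when Commuted is set), and for v8f32 lane 1 must repeat lane 0's pattern offset by HalfSize, since one 8-bit VSHUFPSY immediate controls both lanes. Below is a minimal standalone sketch of that check, not part of the patch: the name isVSHUFPYMaskSketch, the use of std::vector<int> in place of SmallVectorImpl<int> and EVT, and the sample masks are assumptions for illustration only.

#include <cassert>
#include <vector>

// Sketch helpers modeled on LLVM's mask predicates; -1 plays the role of an
// undef mask element.
static bool isUndefOrInRange(int Val, unsigned Low, unsigned Hi) {
  return Val < 0 || (static_cast<unsigned>(Val) >= Low &&
                     static_cast<unsigned>(Val) < Hi);
}

static bool isUndefOrEqual(int Val, int CmpVal) {
  return Val < 0 || Val == CmpVal;
}

// Mirrors the new loop nest in isVSHUFPYMask: l walks the two 128-bit lanes,
// s walks the two quarters of each lane, and Commuted flips which source a
// quarter is expected to read.
static bool isVSHUFPYMaskSketch(const std::vector<int> &Mask,
                                unsigned NumElems, bool Commuted = false) {
  if (NumElems != 4 && NumElems != 8) // v4f64 (VSHUFPDY) or v8f32 (VSHUFPSY)
    return false;
  unsigned QuarterSize = NumElems / 4;
  unsigned HalfSize = QuarterSize * 2; // elements per 128-bit lane
  for (unsigned l = 0; l != 2; ++l) {
    unsigned LaneStart = l * HalfSize;
    for (unsigned s = 0; s != 2; ++s) {
      unsigned QuarterStart = s * QuarterSize;
      unsigned Src = Commuted ? (1 - s) : s;
      unsigned SrcStart = Src * NumElems + LaneStart;
      for (unsigned i = 0; i != QuarterSize; ++i) {
        int Idx = Mask[i + QuarterStart + LaneStart];
        if (!isUndefOrInRange(Idx, SrcStart, SrcStart + HalfSize))
          return false;
        // v8f32 only: lane 1 must repeat lane 0's pattern shifted by HalfSize,
        // because a single immediate encodes both lanes.
        if (NumElems == 4 || l == 0 || Mask[i + QuarterStart] < 0)
          continue;
        if (!isUndefOrEqual(Idx, Mask[i + QuarterStart] + HalfSize))
          return false;
      }
    }
  }
  return true;
}

int main() {
  // v8f32 mask <0,1,8,9,4,5,12,13>: in each lane, two elements from source 1
  // then two from source 2, with lane 1 repeating lane 0 shifted by 4.
  assert(isVSHUFPYMaskSketch({0, 1, 8, 9, 4, 5, 12, 13}, 8));
  // The reversed form only matches once Commuted is set.
  assert(!isVSHUFPYMaskSketch({8, 9, 0, 1, 12, 13, 4, 5}, 8));
  assert(isVSHUFPYMaskSketch({8, 9, 0, 1, 12, 13, 4, 5}, 8, /*Commuted=*/true));
  return 0;
}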
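The same Commuted parameter is what lets the deleted isCommutedSHUFPMask fold into isSHUFPMask: the loop bodies are unchanged and only the expected source ranges swap. Again a standalone sketch under the same assumptions (isSHUFPMaskSketch and std::vector<int> are illustrative stand-ins, not LLVM API):

#include <cassert>
#include <vector>

// Same helper as in the previous sketch; -1 models an undef mask element.
static bool isUndefOrInRange(int Val, unsigned Low, unsigned Hi) {
  return Val < 0 || (static_cast<unsigned>(Val) >= Low &&
                     static_cast<unsigned>(Val) < Hi);
}

// Mirrors the merged isSHUFPMask: Commuted=true checks exactly what the old
// isCommutedSHUFPMask checked.
static bool isSHUFPMaskSketch(const std::vector<int> &Mask, unsigned NumElems,
                              bool Commuted = false) {
  if (NumElems != 2 && NumElems != 4)
    return false;
  unsigned Half = NumElems / 2;
  // Low half must read from source 1, i.e. mask values in [0, NumElems)
  // (source 2 when Commuted).
  unsigned SrcStart = Commuted ? NumElems : 0;
  for (unsigned i = 0; i != Half; ++i)
    if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart + NumElems))
      return false;
  // High half must read from source 2, i.e. [NumElems, 2*NumElems)
  // (source 1 when Commuted).
  SrcStart = Commuted ? 0 : NumElems;
  for (unsigned i = Half; i != NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart + NumElems))
      return false;
  return true;
}

int main() {
  // <0,1,4,5>: low half from source 1, high half from source 2; the shape
  // SHUFPS wants.
  assert(isSHUFPMaskSketch({0, 1, 4, 5}, 4));
  assert(!isSHUFPMaskSketch({0, 1, 4, 5}, 4, /*Commuted=*/true));
  // <4,5,0,1> is the reversed form. It only matches with Commuted=true, which
  // is why LowerVECTOR_SHUFFLE commutes the operands via CommuteVectorShuffle
  // before selecting the instruction.
  assert(!isSHUFPMaskSketch({4, 5, 0, 1}, 4));
  assert(isSHUFPMaskSketch({4, 5, 0, 1}, 4, /*Commuted=*/true));
  return 0;
}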