diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 4e28dfe7fa8..b2f246e4419 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -205,6 +205,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     DecodeUNPCKHPMask(4, ShuffleMask);
     Src1Name = getRegName(MI->getOperand(0).getReg());
     break;
+  case X86::VPERMILPSYri:
+    DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDYri:
+    DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
   }
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index cd06060748b..c1ff0e5011e 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -187,4 +187,31 @@ void DecodeUNPCKLPMask(EVT VT,
   }
 }
 
+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask) {
+  DecodeVPERMILMask(MVT::getVectorVT(MVT::i32, NElts), Imm, ShuffleMask);
+}
+
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask) {
+  DecodeVPERMILMask(MVT::getVectorVT(MVT::i64, NElts), Imm, ShuffleMask);
+}
+
+// DecodeVPERMILMask - Decodes VPERMIL permutes for any 128-bit wide
+// vector with 32/64-bit elements. A 256-bit vector is treated as two
+// 128-bit lanes, and the mask of the first lane should be identical
+// to the mask of the second one.
+void DecodeVPERMILMask(EVT VT, unsigned Imm,
+                       SmallVectorImpl<unsigned> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = 0; i != NumElts/NumLanes; ++i) {
+      unsigned Idx = (Imm >> (i*2)) & 0x3;
+      ShuffleMask.push_back(Idx+(l*NumElts/NumLanes));
+    }
+  }
+}
+
 } // llvm namespace
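For illustration, here is a small standalone sketch (not part of the patch;
decodeVPERMILPS is a made-up helper) that mirrors the DecodeVPERMILMask loop
above for the v8f32 case: each 2-bit field of the immediate selects one of the
four elements inside a 128-bit lane, and the same selection repeats, offset by
the lane size, in the high lane.

    #include <cstdio>
    #include <vector>

    // Expand a VPERMILPS immediate into a shuffle mask, mirroring
    // DecodeVPERMILMask for a v8f32 (2 lanes of 4 elements each).
    static std::vector<unsigned> decodeVPERMILPS(unsigned Imm) {
      const unsigned NumLanes = 2, LaneSize = 4;
      std::vector<unsigned> Mask;
      for (unsigned l = 0; l != NumLanes; ++l)
        for (unsigned i = 0; i != LaneSize; ++i)
          Mask.push_back(((Imm >> (i * 2)) & 0x3) + l * LaneSize);
      return Mask;
    }

    int main() {
      // Imm = 0xB1 = 0b10110001 selects elements 1,0,3,2 within each lane,
      // so the full v8f32 mask comes out as [1,0,3,2,5,4,7,6].
      for (unsigned Idx : decodeVPERMILPS(0xB1))
        printf("%u ", Idx);
      printf("\n");
      return 0;
    }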
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index b18f6703309..4a5214028fa 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -82,6 +82,20 @@ void DecodeUNPCKLPDMask(unsigned NElts,
 void DecodeUNPCKLPMask(EVT VT,
                        SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask);
+
+// DecodeVPERMILMask - Decodes VPERMIL permutes for any 128-bit wide
+// vector with 32/64-bit elements. A 256-bit vector is treated as two
+// 128-bit lanes, and the mask of the first lane should be identical
+// to the mask of the second one.
+void DecodeVPERMILMask(EVT VT, unsigned Imm,
+                       SmallVectorImpl<unsigned> &ShuffleMask);
+
 } // llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c4e44f999d0..1c827f0dd30 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2747,6 +2747,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PUNPCKHBW:
   case X86ISD::PUNPCKHDQ:
   case X86ISD::PUNPCKHQDQ:
+  case X86ISD::VPERMIL:
     return true;
   }
   return false;
@@ -2772,6 +2773,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
+  case X86ISD::VPERMIL:
     return DAG.getNode(Opc, dl, VT, V1,
                        DAG.getConstant(TargetMask, MVT::i8));
   }
@@ -3422,6 +3424,54 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
   return ::isMOVLMask(M, N->getValueType(0));
 }
 
+/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
+static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+
+  // Match any permutation of a 128-bit vector with 32/64-bit elements.
+  if (NumLanes == 1) {
+    if (NumElts == 4 || NumElts == 2)
+      return true;
+    return false;
+  }
+
+  // Only match 256-bit vectors with 32/64-bit elements.
+  if (NumElts != 8 && NumElts != 4)
+    return false;
+
+  // The mask on the high lane should be the same as the low one. They may
+  // only differ when the corresponding index in either lane is undef.
+  int LaneSize = NumElts/NumLanes;
+  for (int i = 0; i < LaneSize; ++i) {
+    int HighElt = i+LaneSize;
+    if (Mask[i] < 0 || Mask[HighElt] < 0)
+      continue;
+
+    if (Mask[HighElt]-Mask[i] != LaneSize)
+      return false;
+  }
+
+  return true;
+}
+
+/// getShuffleVPERMILImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMIL* instructions.
+static unsigned getShuffleVPERMILImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+
+  unsigned Mask = 0;
+  for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
+    Mask |= SVOp->getMaskElt(i) << (i*2);
+
+  return Mask;
+}
+
 /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
 /// of what x86 movss want. X86 movs requires the lowest element to be lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
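Going the other direction, the immediate built by getShuffleVPERMILImmediate
is just the low lane's indices packed two bits apiece. A standalone sketch
(illustrative only; encodeVPERMILPS is a made-up helper, and it assumes no
undefs in the low lane) showing the round trip for the mask [1,0,3,2,5,4,7,6]
decoded in the earlier example:

    #include <cassert>
    #include <cstdio>

    // Pack the low-lane indices of a v8f32 shuffle mask into a VPERMILPS
    // immediate: lane size 4, 2 bits per element. The high lane is ignored
    // because it must repeat the low one.
    static unsigned encodeVPERMILPS(const int Mask[8]) {
      unsigned Imm = 0;
      for (int i = 0; i < 4; ++i)
        Imm |= Mask[i] << (i * 2);
      return Imm;
    }

    int main() {
      const int Mask[8] = {1, 0, 3, 2, 5, 4, 7, 6};
      unsigned Imm = encodeVPERMILPS(Mask);
      // 1 | (0<<2) | (3<<4) | (2<<6) = 1 + 48 + 128 = 177 = 0xB1, which
      // DecodeVPERMILMask expands back to [1,0,3,2,5,4,7,6].
      printf("imm = 0x%X\n", Imm);
      assert(Imm == 0xB1);
      return 0;
    }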
@@ -4097,6 +4147,11 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
     return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                Depth+1);
   }
+  case X86ISD::VPERMIL:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                      ShuffleMask);
+    break;
   default:
     assert("not implemented for target shuffle node");
     return SDValue();
@@ -6043,6 +6097,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (NumElems == 4)
     return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
 
+  // Handle VPERMIL permutations
+  if (isVPERMILMask(M, VT)) {
+    unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
+    if (VT == MVT::v8f32)
+      return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
+  }
+
   return SDValue();
 }
@@ -9660,6 +9721,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PUNPCKHWD:         return "X86ISD::PUNPCKHWD";
   case X86ISD::PUNPCKHDQ:         return "X86ISD::PUNPCKHDQ";
   case X86ISD::PUNPCKHQDQ:        return "X86ISD::PUNPCKHQDQ";
+  case X86ISD::VPERMIL:           return "X86ISD::VPERMIL";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:          return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:        return "X86ISD::WIN_ALLOCA";
@@ -12465,6 +12527,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::PSHUFLW:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
+  case X86ISD::VPERMIL:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
   }
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 376aa8a4409..298d45110ff 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -271,6 +271,7 @@ namespace llvm {
       PUNPCKHWD,
       PUNPCKHDQ,
       PUNPCKHQDQ,
+      VPERMIL,
 
       // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
       // according to %al. An operator is needed so that this can be expanded
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index b00109c9fa4..f792f53b7ba 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -150,6 +150,8 @@ def X86Punpckhwd  : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
 def X86Punpckhdq  : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
 def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;
 
+def X86VPermil : SDNode<"X86ISD::VPERMIL", SDTShuff2OpI>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ae26a80fe2d..73e465fe29e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5529,6 +5529,10 @@ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
 // The AVX version of some but not all of them are described here, and more
 // should come in a near future.
 
+// Shuffle with VPERMIL instructions
+def : Pat<(v8f32 (X86VPermil VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+
 // Shuffle with PSHUFD instruction folding loads. The first two patterns match
 // SSE2 loads, which are always promoted to v2i64. The last one should match
 // the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD
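The subtle part of isVPERMILMask is the undef handling: a high-lane entry only
has to match its low-lane counterpart (offset by the lane size) when both are
defined. A standalone restatement of that check (illustrative; lanesMatch is a
made-up helper, using -1 for undef the way ShuffleVectorSDNode masks do):

    #include <cstdio>

    // Check that the high 128-bit lane of a v8f32 shuffle mask repeats the
    // low lane offset by the lane size (4), skipping undef (-1) entries.
    static bool lanesMatch(const int Mask[8]) {
      const int LaneSize = 4;
      for (int i = 0; i < LaneSize; ++i) {
        if (Mask[i] < 0 || Mask[i + LaneSize] < 0)
          continue; // undef on either side imposes no constraint
        if (Mask[i + LaneSize] - Mask[i] != LaneSize)
          return false;
      }
      return true;
    }

    int main() {
      const int A[8] = {1, 0, 3, 2, 5, 4, 7, 6};  // lanes agree     -> 1
      const int B[8] = {1, 0, 3, 2, -1, 4, 7, 6}; // undef tolerated -> 1
      const int C[8] = {1, 0, 3, 2, 4, 5, 6, 7};  // lanes disagree  -> 0
      printf("%d %d %d\n", lanesMatch(A), lanesMatch(B), lanesMatch(C));
      return 0;
    }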
diff --git a/test/CodeGen/X86/avx-256-splat.ll b/test/CodeGen/X86/avx-256-splat.ll
new file mode 100644
index 00000000000..27ff9268b40
--- /dev/null
+++ b/test/CodeGen/X86/avx-256-splat.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; FIXME: use avx versions for punpcklbw and punpckhbw
+
+; CHECK: vextractf128 $0
+; CHECK-NEXT: punpcklbw
+; CHECK-NEXT: punpckhbw
+; CHECK-NEXT: vinsertf128 $0
+; CHECK-NEXT: vinsertf128 $1
+; CHECK-NEXT: vpermilps $85
+define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <32 x i8> %shuffle
+}
+
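As a cross-check on the expected immediate in the test (a quick illustrative
computation, not part of the patch): assuming the unpack sequence in the CHECK
lines leaves the splatted byte 5 in 32-bit element 1 of each lane (bytes 4-7),
every 2-bit field of the VPERMILPS immediate must select element 1, giving
0b01010101 = 85.

    #include <cassert>

    int main() {
      // Four 2-bit fields, each selecting dword element 1 of its lane.
      unsigned Imm = 0;
      for (int i = 0; i < 4; ++i)
        Imm |= 1u << (i * 2);
      assert(Imm == 85); // matches the "vpermilps $85" CHECK line
      return 0;
    }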