diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6099ba93260..cde20c3cb37 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2850,6 +2850,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::VPERMILPS:
   case X86ISD::VPERMILPD:
   case X86ISD::VPERM2F128:
+  case X86ISD::VPERM2I128:
     return true;
   }
   return false;
@@ -2891,6 +2892,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::SHUFPD:
   case X86ISD::SHUFPS:
   case X86ISD::VPERM2F128:
+  case X86ISD::VPERM2I128:
     return DAG.getNode(Opc, dl, VT, V1, V2,
                        DAG.getConstant(TargetMask, MVT::i8));
   }
@@ -3283,8 +3285,8 @@ static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
     return false;

   // For VSHUFPSY, the mask of the second half must be the same as the first
-  // but with // the appropriate offsets. This works in the same way as
-  // VPERMILPS // works with masks.
+  // but with the appropriate offsets. This works in the same way as
+  // VPERMILPS works with masks.
   for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
     if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
       return false;
@@ -3358,8 +3360,8 @@ static bool isCommutedVSHUFPY(ShuffleVectorSDNode *N, bool HasAVX) {
     return false;

   // For VSHUFPSY, the mask of the second half must be the same as the first
-  // but with // the appropriate offsets. This works in the same way as
-  // VPERMILPS // works with masks.
+  // but with the appropriate offsets. This works in the same way as
+  // VPERMILPS works with masks.
   for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
     if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
       return false;
@@ -3753,15 +3755,15 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
   return ::isMOVLMask(M, N->getValueType(0));
 }

-/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
 /// as permutations between 128-bit chunks or halves. As an example: this
 /// shuffle below:
 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
 /// The first half comes from the second half of V1 and the second half from
 /// the second half of V2.
-static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
-                             const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+                             bool HasAVX) {
+  if (!HasAVX || VT.getSizeInBits() != 256)
     return false;

   // The shuffle result is divided into half A and half B. In total the two
@@ -3789,9 +3791,9 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
   return MatchA && MatchB;
 }

-/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
-static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
+static unsigned getShuffleVPERM2X128Immediate(SDNode *N) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   EVT VT = SVOp->getValueType(0);

@@ -3814,80 +3816,47 @@ static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
   return (FstHalf | (SndHalf << 4));
 }

-/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
 /// Note that VPERMIL mask matching is different depending on whether the
 /// underlying type is 32 or 64. In the VPERMILPS the high half of the mask
 /// should point to the same elements of the low, but to the higher half of
 /// the source. In VPERMILPD the two lanes could be shuffled independently of
 /// each other with the same restriction that lanes can't be crossed.
-static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                            const X86Subtarget *Subtarget) {
+static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                           bool HasAVX) {
   int NumElts = VT.getVectorNumElements();
   int NumLanes = VT.getSizeInBits()/128;

-  if (!Subtarget->hasAVX())
+  if (!HasAVX)
     return false;

-  // Only match 256-bit with 64-bit types
-  if (VT.getSizeInBits() != 256 || NumElts != 4)
+  // Only match 256-bit with 32/64-bit types
+  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
     return false;

-  // The mask on the high lane is independent of the low. Both can match
-  // any element in inside its own lane, but can't cross.
   int LaneSize = NumElts/NumLanes;
-  for (int l = 0; l < NumLanes; ++l)
-    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
-      int LaneStart = l*LaneSize;
-      if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
+  for (int l = 0; l != NumLanes; ++l) {
+    int LaneStart = l*LaneSize;
+    for (int i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize))
+        return false;
+      if (NumElts == 4 || l == 0)
+        continue;
+      // VPERMILPS handling
+      if (Mask[i] < 0)
+        continue;
+      if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize))
         return false;
     }
-
-  return true;
-}
-
-/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
-/// Note that VPERMIL mask matching is different depending whether theunderlying
-/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
-/// to the same elements of the low, but to the higher half of the source.
-/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed.
-static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                            const X86Subtarget *Subtarget) {
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
-
-  if (!Subtarget->hasAVX())
-    return false;
-
-  // Only match 256-bit with 32-bit types
-  if (VT.getSizeInBits() != 256 || NumElts != 8)
-    return false;
-
-  // The mask on the high lane should be the same as the low. Actually,
-  // they can differ if any of the corresponding index in a lane is undef
-  // and the other stays in range.
-  int LaneSize = NumElts/NumLanes;
-  for (int i = 0; i < LaneSize; ++i) {
-    int HighElt = i+LaneSize;
-    bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
-    bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
-
-    if (!HighValid || !LowValid)
-      return false;
-    if (Mask[i] < 0 || Mask[HighElt] < 0)
-      continue;
-    if (Mask[HighElt]-Mask[i] != LaneSize)
-      return false;
   }

   return true;
 }

-/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPS* instructions.
-static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
+/// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPS/D* instructions.
+static unsigned getShuffleVPERMILPImmediate(SDNode *N) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   EVT VT = SVOp->getValueType(0);

@@ -3899,43 +3868,22 @@ static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
   // where a mask will match because the same mask element is undef on the
   // first half but valid on the second. This would get pathological cases
   // such as: shuffle , which is completely valid.
+  unsigned Shift = (LaneSize == 4) ? 2 : 1;
   unsigned Mask = 0;
-  for (int l = 0; l < NumLanes; ++l) {
-    for (int i = 0; i < LaneSize; ++i) {
-      int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
-      if (MaskElt < 0)
-        continue;
-      if (MaskElt >= LaneSize)
-        MaskElt -= LaneSize;
-      Mask |= MaskElt << (i*2);
-    }
+  for (int i = 0; i != NumElts; ++i) {
+    int MaskElt = SVOp->getMaskElt(i);
+    if (MaskElt < 0)
+      continue;
+    MaskElt %= LaneSize;
+    unsigned Shamt = i;
+    // For VPERMILPSY, the mask of the first half must be equal to the second
+    if (NumElts == 8) Shamt %= LaneSize;
+    Mask |= MaskElt << (Shamt*Shift);
   }

   return Mask;
 }

-/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
-static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  EVT VT = SVOp->getValueType(0);
-
-  int NumElts = VT.getVectorNumElements();
-  int NumLanes = VT.getSizeInBits()/128;
-
-  unsigned Mask = 0;
-  int LaneSize = NumElts/NumLanes;
-  for (int l = 0; l < NumLanes; ++l)
-    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
-      int MaskElt = SVOp->getMaskElt(i);
-      if (MaskElt < 0)
-        continue;
-      Mask |= (MaskElt-l*LaneSize) << i;
-    }
-
-  return Mask;
-}
-
 /// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
 /// x86 movss wants. X86 movss requires the lowest element to be the lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -4677,6 +4625,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
                          ShuffleMask);
     break;
   case X86ISD::VPERM2F128:
+  case X86ISD::VPERM2I128:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                          ShuffleMask);
@@ -6596,6 +6545,22 @@ static inline unsigned getVPERMILOpcode(EVT VT) {
   return 0;
 }

+static inline unsigned getVPERM2X128Opcode(EVT VT, bool HasAVX2) {
+  switch(VT.getSimpleVT().SimpleTy) {
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+  case MVT::v4i64:
+    if (HasAVX2) return X86ISD::VPERM2I128;
+    // else use fp unit for int vperm
+  case MVT::v8f32:
+  case MVT::v4f64: return X86ISD::VPERM2F128;
+  default:
+    llvm_unreachable("Unknown type for vperm2x128");
+  }
+  return 0;
+}
+
 static SDValue
 NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                        const TargetLowering &TLI,
@@ -6910,22 +6875,17 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (isMOVDDUPYMask(SVOp, Subtarget))
     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);

-  // Handle VPERMILPS* permutations
-  if (isVPERMILPSMask(M, VT, Subtarget))
+  // Handle VPERMILPS/D* permutations
+  if (isVPERMILPMask(M, VT, Subtarget->hasAVX()))
     return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
-                                getShuffleVPERMILPSImmediate(SVOp), DAG);
+                                getShuffleVPERMILPImmediate(SVOp), DAG);

-  // Handle VPERMILPD* permutations
-  if (isVPERMILPDMask(M, VT, Subtarget))
-    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
-                                getShuffleVPERMILPDImmediate(SVOp), DAG);
+  // Handle VPERM2F128/VPERM2I128 permutations
+  if (isVPERM2X128Mask(M, VT, Subtarget->hasAVX()))
+    return getTargetShuffleNode(getVPERM2X128Opcode(VT, HasAVX2), dl, VT, V1,
+                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);

-  // Handle VPERM2F128 permutations
-  if (isVPERM2F128Mask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
-                                getShuffleVPERM2F128Immediate(SVOp), DAG);
-
-  // Handle VSHUFPSY permutations
+  // Handle VSHUFPS/DY permutations
   if (isVSHUFPYMask(M, VT, Subtarget->hasAVX()))
     return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
                                 getShuffleVSHUFPYImmediate(SVOp), DAG);
@@ -11223,6 +11183,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS";
   case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD";
   case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128";
+  case X86ISD::VPERM2I128: return "X86ISD::VPERM2I128";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
@@ -14810,6 +14771,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERMILPS:
   case X86ISD::VPERMILPD:
   case X86ISD::VPERM2F128:
+  case X86ISD::VPERM2I128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
   }
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 638f80c6e00..e0a0e295783 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -280,6 +280,7 @@ namespace llvm {
       VPERMILPS,
       VPERMILPD,
       VPERM2F128,
+      VPERM2I128,
       VBROADCAST,

       // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index db2b652c290..ff9c1433fad 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -140,6 +140,7 @@ def X86VPermilps : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>;
 def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>;

 def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>;
+def X86VPerm2i128 : SDNode<"X86ISD::VPERM2I128", SDTShuff3OpI>;

 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 2b240fc03a4..28515ba1d6b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7359,7 +7359,7 @@ def : Pat<(int_x86_avx_vperm2f128_pd_256
                   VR256:$src1, (memopv4f64 addr:$src2), imm:$src3),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
 def : Pat<(int_x86_avx_vperm2f128_si_256
-                  VR256:$src1, (memopv8i32 addr:$src2), imm:$src3),
+                  VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)), imm:$src3),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;

 def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7375,6 +7375,25 @@ def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
 def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

+def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1,
+                  (memopv8f32 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1,
+                  (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1,
+                  (memopv4i64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1,
+                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1,
+                  (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1,
+                  (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+
 //===----------------------------------------------------------------------===//
 // VZERO - Zero YMM registers
 //
@@ -7571,7 +7590,7 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64,
                              int_x86_avx2_permpd>, VEX_W;

 //===----------------------------------------------------------------------===//
-// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+// VPERM2I128 - Permute Integer Values in 128-bit chunks
 //
 def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
           (ins VR256:$src1, VR256:$src2, i8imm:$src3),
@@ -7587,6 +7606,30 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
                              imm:$src3))]>, VEX_4V;

+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86VPerm2i128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2i128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2i128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2i128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v32i8 (X86VPerm2i128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2i128 VR256:$src1,
+                   (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2i128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2i128 VR256:$src1, (memopv4i64 addr:$src2),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
 //===----------------------------------------------------------------------===//
 // VINSERTI128 - Insert packed integer values
 //

diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll
new file mode 100644
index 00000000000..0f24ac33316
--- /dev/null
+++ b/test/CodeGen/X86/avx2-vperm2i128.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; CHECK: vperm2i128 $17
+define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
+  ret <32 x i8> %shuffle
+}
+
+; CHECK: vperm2i128 $33
+define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32>
+  ret <4 x i64> %shuffle
+}
+
+; CHECK: vperm2i128 $49
+define <8 x i32> @E3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
+  ret <8 x i32> %shuffle
+}
+
+; CHECK: vperm2i128 $2
+define <16 x i16> @E4(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
+  ret <16 x i16> %shuffle
+}
+
+; CHECK: vperm2i128 $2, (%
+define <16 x i16> @E5(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
+entry:
+  %c = load <16 x i16>* %a
+  %d = load <16 x i16>* %b
+  %shuffle = shufflevector <16 x i16> %c, <16 x i16> %d, <16 x i32>
+  ret <16 x i16> %shuffle
+}
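
For reference, a minimal standalone C++ sketch (not part of the patch) of how the VPERM2F128/VPERM2I128 selection immediate relates to a 256-bit shuffle mask, mirroring the isVPERM2X128Mask/getShuffleVPERM2X128Immediate logic above. The helper name vperm2x128Imm and the simplified first-defined-element scan are illustrative assumptions, not code from the tree.

// Each 128-bit half of the result selects one of the four 128-bit chunks of
// the two concatenated sources; bits [1:0] of the immediate pick the chunk
// for the low half and bits [5:4] pick the chunk for the high half.
#include <cstdio>
#include <vector>

// Mask has one entry per element of the 256-bit result; entries index the
// concatenation of both sources (0 .. 2*NumElems-1), -1 meaning "undef".
// Assumes the mask already satisfies the isVPERM2X128Mask constraints, i.e.
// every defined entry in a half points into the same 128-bit chunk.
static unsigned vperm2x128Imm(const std::vector<int> &Mask) {
  unsigned NumElems = Mask.size();   // elements in one 256-bit vector
  unsigned HalfSize = NumElems / 2;  // elements in one 128-bit chunk
  unsigned FstHalf = 0, SndHalf = 0;
  for (unsigned i = 0; i != HalfSize; ++i)
    if (Mask[i] >= 0) { FstHalf = Mask[i] / HalfSize; break; }
  for (unsigned i = HalfSize; i != NumElems; ++i)
    if (Mask[i] >= 0) { SndHalf = Mask[i] / HalfSize; break; }
  return FstHalf | (SndHalf << 4);
}

int main() {
  // The example from the isVPERM2X128Mask comment: <4, 5, 6, 7, 12, 13, 14, 15>
  // takes the high half of V1 (chunk 1) and the high half of V2 (chunk 3),
  // so the immediate is 1 | (3 << 4) = 0x31.
  std::vector<int> Mask = {4, 5, 6, 7, 12, 13, 14, 15};
  std::printf("imm = 0x%02x\n", vperm2x128Imm(Mask));
  return 0;
}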