diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1ed1ee77e46..e72c8d5f2d4 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4520,8 +4520,10 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     SDValue Op = N0.getOperand(0);
     if (Op.getValueType().bitsLT(VT)) {
       Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op);
+      AddToWorkList(Op.getNode());
     } else if (Op.getValueType().bitsGT(VT)) {
       Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+      AddToWorkList(Op.getNode());
     }
     return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),
                                   N0.getValueType().getScalarType());
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a03b97f3215..5e52b84efd3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1222,6 +1222,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -13033,6 +13034,20 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
 
   if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
 
+    if (Subtarget->hasAVX2()) {
+      // AVX2: v4i64 -> v4i32
+
+      // VPERMD
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
+      Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
+                                ShufMask);
+
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, DAG.getIntPtrConstant(0));
+    }
+
+    // AVX: v4i64 -> v4i32
     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
                                DAG.getIntPtrConstant(0));
@@ -13057,6 +13072,40 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
 
   if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+    if (Subtarget->hasAVX2()) {
+      // AVX2: v8i32 -> v8i16
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
+      // PSHUFB
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned i = 0; i < 2; ++i) {
+        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+        for (unsigned j = 0; j < 8; ++j)
+          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+      }
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8, &pshufbMask[0],
+                               32);
+      Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
+
+      static const int ShufMask[] = {0, 2, -1, -1};
+      Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64),
+                                &ShufMask[0]);
+
+      Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                       DAG.getIntPtrConstant(0));
+
+      return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+    }
+
     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
                                DAG.getIntPtrConstant(0));
@@ -14822,15 +14871,6 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
   if (!Subtarget->hasAVX())
     return SDValue();
 
-  // Optimize vectors in AVX mode
-  // Sign extend v8i16 to v8i32 and
-  //      v4i32 to v4i64
-  //
-  // Divide input vector into two parts
-  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
-  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
-  // concat the vectors to original VT
-
   EVT VT = N->getValueType(0);
   SDValue Op = N->getOperand(0);
   EVT OpVT = Op.getValueType();
@@ -14839,6 +14879,19 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
   if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
       (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
 
+    if (Subtarget->hasAVX2()) {
+      return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
+    }
+
+    // Optimize vectors in AVX mode
+    // Sign extend v8i16 to v8i32 and
+    //      v4i32 to v4i64
+    //
+    // Divide input vector into two parts
+    // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+    // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+    // concat the vectors to original VT
+
     unsigned NumElems = OpVT.getVectorNumElements();
     SmallVector<int,8> ShufMask1(NumElems, -1);
     for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
@@ -14906,6 +14959,9 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
       ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
+    if (Subtarget->hasAVX2())
+      return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
+
     SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
     SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT,
                                         N0, ZeroVec, DAG);
@@ -15108,6 +15164,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
+  case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);
   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 35801e43229..ffc6cbea3f1 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,9 +71,14 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS",
                  SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
 def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
                  SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL",
+                 SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                      SDTCisOpSmallerThanOp<1, 0> ]>>;
+
 def X86vsmovl  : SDNode<"X86ISD::VSEXT_MOVL",
                  SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
-
+
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def X86vshldq  : SDNode<"X86ISD::VSHLDQ",    SDTIntShiftOp>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 65e3c1e19fa..450d29a8574 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5730,14 +5730,26 @@ let Predicates = [HasSSE41] in {
             (PMOVZXDQrm addr:$src)>;
 }
 
+let Predicates = [HasAVX2] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
+              (VPMOVZXDQYrr VR128:$src)>;
+    def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
+              (VPMOVZXWDYrr VR128:$src)>;
+  }
+
+  def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+  def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+}
+
 let Predicates = [HasAVX] in {
-def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
-def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
 }
 
 let Predicates = [HasSSE41] in {
-def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
-def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
 }
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
new file mode 100755
index 00000000000..fe87de936a3
--- /dev/null
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; CHECK: trunc4
+; CHECK: vpermd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
+  %B = trunc <4 x i64> %A to <4 x i32>
+  ret <4 x i32>%B
+}
+
+; CHECK: trunc8
+; CHECK: vpshufb
+; CHECK-NOT: vinsert
+; CHECK: ret
+
+define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
+  %B = trunc <8 x i32> %A to <8 x i16>
+  ret <8 x i16>%B
+}
+
+; CHECK: sext4
+; CHECK: vpmovsxdq
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i64> @sext4(<4 x i32> %A) nounwind {
+  %B = sext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+; CHECK: sext8
+; CHECK: vpmovsxwd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <8 x i32> @sext8(<8 x i16> %A) nounwind {
+  %B = sext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+; CHECK: zext4
+; CHECK: vpmovzxdq
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i64> @zext4(<4 x i32> %A) nounwind {
+  %B = zext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+; CHECK: zext8
+; CHECK: vpmovzxwd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <8 x i32> @zext8(<8 x i16> %A) nounwind {
+  %B = zext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+; CHECK: zext_8i8_8i32
+; CHECK: vpmovzxwd
+; CHECK: vpand
+; CHECK: ret
+define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
+  %B = zext <8 x i8> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+
+
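
Note, not part of the patch: the sketch below is a standalone scalar model of the two AVX2 truncate lowerings added to PerformTruncateCombine above, assuming x86 little-endian lane order. The helper names TruncV4I64/TruncV8I32 and the sample values are illustrative only; the point is to show why the VPERMD mask {0, 2, 4, 6} and the per-lane PSHUFB byte mask {0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd} followed by the v4i64 {0, 2} shuffle reproduce an element-wise trunc.

// Scalar sketch (illustrative only), assuming little-endian element layout.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

// v4i64 -> v4i32 path: bitcast to v8i32, VPERMD with mask {0, 2, 4, 6},
// then keep the low four lanes (EXTRACT_SUBVECTOR at index 0).
static std::array<uint32_t, 4> TruncV4I64(const std::array<uint64_t, 4> &In) {
  std::array<uint32_t, 8> As32;                     // the ISD::BITCAST view
  std::memcpy(As32.data(), In.data(), sizeof(In));
  // Lane 2*i holds the low 32 bits of 64-bit element i, so picking lanes
  // {0, 2, 4, 6} is exactly the truncation.
  return {As32[0], As32[2], As32[4], As32[6]};
}

// v8i32 -> v8i16 path: the per-128-bit-lane PSHUFB packs the low 16 bits of
// each dword into the lane's low 8 bytes (the 0x80 entries zero the rest);
// the v4i64 {0, 2} shuffle then glues the two lane halves together.
static std::array<uint16_t, 8> TruncV8I32(const std::array<uint32_t, 8> &In) {
  std::array<uint8_t, 32> Bytes;                    // the v32i8 view
  std::memcpy(Bytes.data(), In.data(), sizeof(In));

  static const unsigned LaneMask[8] = {0, 1, 4, 5, 8, 9, 12, 13};
  std::array<uint8_t, 32> Shuf = {};                // zeros model the 0x80 entries
  for (unsigned Lane = 0; Lane != 2; ++Lane)
    for (unsigned i = 0; i != 8; ++i)
      Shuf[Lane * 16 + i] = Bytes[Lane * 16 + LaneMask[i]];

  std::array<uint8_t, 16> Low;                      // v4i64 shuffle {0, 2}
  std::memcpy(Low.data(), Shuf.data(), 8);          // 64-bit element 0
  std::memcpy(Low.data() + 8, Shuf.data() + 16, 8); // 64-bit element 2

  std::array<uint16_t, 8> Out;                      // final bitcast to v8i16
  std::memcpy(Out.data(), Low.data(), sizeof(Out));
  return Out;
}

int main() {
  const std::array<uint64_t, 4> In64 = {0x1111222233334444ULL, 0x5555666677778888ULL,
                                        0x9999aaaabbbbccccULL, 0xddddeeeeffff0123ULL};
  const std::array<uint32_t, 4> Out32 = TruncV4I64(In64);
  for (unsigned i = 0; i != 4; ++i)
    assert(Out32[i] == static_cast<uint32_t>(In64[i]));   // matches a plain trunc

  const std::array<uint32_t, 8> In32 = {0x00010002, 0x00030004, 0x00050006, 0x00070008,
                                        0x0009000a, 0x000b000c, 0x000d000e, 0x000f0010};
  const std::array<uint16_t, 8> Out16 = TruncV8I32(In32);
  for (unsigned i = 0; i != 8; ++i)
    assert(Out16[i] == static_cast<uint16_t>(In32[i]));   // matches a plain trunc
  return 0;
}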