From cbbe33fde4f484ffdc9a171b69d6f98eb89dabd9 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Fri, 18 Nov 2011 02:49:55 +0000 Subject: [PATCH] Add AVX2 vpbroadcast support git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@144967 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 43 ++++++--- lib/Target/X86/X86InstrSSE.td | 47 ++++++--- test/CodeGen/X86/avx2-vbroadcast.ll | 142 ++++++++++++++++++++++++++++ 3 files changed, 204 insertions(+), 28 deletions(-) create mode 100644 test/CodeGen/X86/avx2-vbroadcast.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 4986aac04f2..6a14f220a47 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5115,9 +5115,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, /// 1. A splat BUILD_VECTOR which uses a single scalar load. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from /// a scalar load. -/// The scalar load node is returned when a pattern is found, -/// or SDValue() otherwise. -static SDValue isVectorBroadcast(SDValue &Op) { +/// The scalar load node is returned when a pattern is found, +/// or SDValue() otherwise. +static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) { EVT VT = Op.getValueType(); SDValue V = Op; @@ -5134,16 +5134,16 @@ static SDValue isVectorBroadcast(SDValue &Op) { case ISD::BUILD_VECTOR: { // The BUILD_VECTOR node must be a splat. - if (!isSplatVector(V.getNode())) + if (!isSplatVector(V.getNode())) return SDValue(); Ld = V.getOperand(0); - - // The suspected load node has several users. Make sure that all + + // The suspected load node has several users. Make sure that all // of its users are from the BUILD_VECTOR node. - if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) return SDValue(); - break; + break; } case ISD::VECTOR_SHUFFLE: { @@ -5151,11 +5151,11 @@ static SDValue isVectorBroadcast(SDValue &Op) { // Shuffles must have a splat mask where the first element is // broadcasted. - if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) + if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) return SDValue(); SDValue Sc = Op.getOperand(0); - if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) return SDValue(); Ld = Sc.getOperand(0); @@ -5167,15 +5167,27 @@ static SDValue isVectorBroadcast(SDValue &Op) { break; } } - + // The scalar source must be a normal load. - if (!ISD::isNormalLoad(Ld.getNode())) + if (!ISD::isNormalLoad(Ld.getNode())) return SDValue(); - + bool Is256 = VT.getSizeInBits() == 256; bool Is128 = VT.getSizeInBits() == 128; unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + if (hasAVX2) { + // VBroadcast to YMM + if (Is256 && (ScalarSize == 8 || ScalarSize == 16 || + ScalarSize == 32 || ScalarSize == 64 )) + return Ld; + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 8 || ScalarSize == 32 || + ScalarSize == 16 || ScalarSize == 64 )) + return Ld; + } + // VBroadcast to YMM if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) return Ld; @@ -5184,6 +5196,7 @@ static SDValue isVectorBroadcast(SDValue &Op) { if (Is128 && (ScalarSize == 32)) return Ld; + // Unsupported broadcast. return SDValue(); } @@ -5216,7 +5229,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return getOnesVector(Op.getValueType(), DAG, dl); } - SDValue LD = isVectorBroadcast(Op); + SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); if (Subtarget->hasAVX() && LD.getNode()) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); @@ -6613,7 +6626,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, return Op; // Use vbroadcast whenever the splat comes from a foldable load - SDValue LD = isVectorBroadcast(Op); + SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); if (Subtarget->hasAVX() && LD.getNode()) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 11f4785b5e2..e5957508501 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7189,19 +7189,6 @@ def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; -def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VBROADCASTSDrm addr:$src)>; -def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDrm addr:$src)>; - -def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; -def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; //===----------------------------------------------------------------------===// // VINSERTF128 - Insert packed floating-point values @@ -7557,6 +7544,40 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, int_x86_avx2_pbroadcastq_128, int_x86_avx2_pbroadcastq_256>; +let Predicates = [HasAVX2] in { + def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), + (VPBROADCASTBrm addr:$src)>; + def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), + (VPBROADCASTBYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDrm addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDYrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VPBROADCASTQrm addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VPBROADCASTQYrm addr:$src)>; +} + +// AVX1 broadcast patterns +def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VBROADCASTSDrm addr:$src)>; +def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDrm addr:$src)>; + +def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; + //===----------------------------------------------------------------------===// // VPERM - Permute instructions // diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll new file mode 100644 index 00000000000..142be33c5e2 --- /dev/null +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -0,0 +1,142 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s + +; CHECK: vpbroadcastb (% +define <16 x i8> @BB16(i8* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i8* %ptr, align 4 + %q0 = insertelement <16 x i8> undef, i8 %q, i32 0 + %q1 = insertelement <16 x i8> %q0, i8 %q, i32 1 + %q2 = insertelement <16 x i8> %q1, i8 %q, i32 2 + %q3 = insertelement <16 x i8> %q2, i8 %q, i32 3 + %q4 = insertelement <16 x i8> %q3, i8 %q, i32 4 + %q5 = insertelement <16 x i8> %q4, i8 %q, i32 5 + %q6 = insertelement <16 x i8> %q5, i8 %q, i32 6 + %q7 = insertelement <16 x i8> %q6, i8 %q, i32 7 + %q8 = insertelement <16 x i8> %q7, i8 %q, i32 8 + %q9 = insertelement <16 x i8> %q8, i8 %q, i32 9 + %qa = insertelement <16 x i8> %q9, i8 %q, i32 10 + %qb = insertelement <16 x i8> %qa, i8 %q, i32 11 + %qc = insertelement <16 x i8> %qb, i8 %q, i32 12 + %qd = insertelement <16 x i8> %qc, i8 %q, i32 13 + %qe = insertelement <16 x i8> %qd, i8 %q, i32 14 + %qf = insertelement <16 x i8> %qe, i8 %q, i32 15 + ret <16 x i8> %qf +} +; CHECK: vpbroadcastb (% +define <32 x i8> @BB32(i8* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i8* %ptr, align 4 + %q0 = insertelement <32 x i8> undef, i8 %q, i32 0 + %q1 = insertelement <32 x i8> %q0, i8 %q, i32 1 + %q2 = insertelement <32 x i8> %q1, i8 %q, i32 2 + %q3 = insertelement <32 x i8> %q2, i8 %q, i32 3 + %q4 = insertelement <32 x i8> %q3, i8 %q, i32 4 + %q5 = insertelement <32 x i8> %q4, i8 %q, i32 5 + %q6 = insertelement <32 x i8> %q5, i8 %q, i32 6 + %q7 = insertelement <32 x i8> %q6, i8 %q, i32 7 + %q8 = insertelement <32 x i8> %q7, i8 %q, i32 8 + %q9 = insertelement <32 x i8> %q8, i8 %q, i32 9 + %qa = insertelement <32 x i8> %q9, i8 %q, i32 10 + %qb = insertelement <32 x i8> %qa, i8 %q, i32 11 + %qc = insertelement <32 x i8> %qb, i8 %q, i32 12 + %qd = insertelement <32 x i8> %qc, i8 %q, i32 13 + %qe = insertelement <32 x i8> %qd, i8 %q, i32 14 + %qf = insertelement <32 x i8> %qe, i8 %q, i32 15 + + %q20 = insertelement <32 x i8> %qf, i8 %q, i32 16 + %q21 = insertelement <32 x i8> %q20, i8 %q, i32 17 + %q22 = insertelement <32 x i8> %q21, i8 %q, i32 18 + %q23 = insertelement <32 x i8> %q22, i8 %q, i32 19 + %q24 = insertelement <32 x i8> %q23, i8 %q, i32 20 + %q25 = insertelement <32 x i8> %q24, i8 %q, i32 21 + %q26 = insertelement <32 x i8> %q25, i8 %q, i32 22 + %q27 = insertelement <32 x i8> %q26, i8 %q, i32 23 + %q28 = insertelement <32 x i8> %q27, i8 %q, i32 24 + %q29 = insertelement <32 x i8> %q28, i8 %q, i32 25 + %q2a = insertelement <32 x i8> %q29, i8 %q, i32 26 + %q2b = insertelement <32 x i8> %q2a, i8 %q, i32 27 + %q2c = insertelement <32 x i8> %q2b, i8 %q, i32 28 + %q2d = insertelement <32 x i8> %q2c, i8 %q, i32 29 + %q2e = insertelement <32 x i8> %q2d, i8 %q, i32 30 + %q2f = insertelement <32 x i8> %q2e, i8 %q, i32 31 + ret <32 x i8> %q2f +} +; CHECK: vpbroadcastw (% + +define <8 x i16> @W16(i16* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i16* %ptr, align 4 + %q0 = insertelement <8 x i16> undef, i16 %q, i32 0 + %q1 = insertelement <8 x i16> %q0, i16 %q, i32 1 + %q2 = insertelement <8 x i16> %q1, i16 %q, i32 2 + %q3 = insertelement <8 x i16> %q2, i16 %q, i32 3 + %q4 = insertelement <8 x i16> %q3, i16 %q, i32 4 + %q5 = insertelement <8 x i16> %q4, i16 %q, i32 5 + %q6 = insertelement <8 x i16> %q5, i16 %q, i32 6 + %q7 = insertelement <8 x i16> %q6, i16 %q, i32 7 + ret <8 x i16> %q7 +} +; CHECK: vpbroadcastw (% +define <16 x i16> @WW16(i16* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i16* %ptr, align 4 + %q0 = insertelement <16 x i16> undef, i16 %q, i32 0 + %q1 = insertelement <16 x i16> %q0, i16 %q, i32 1 + %q2 = insertelement <16 x i16> %q1, i16 %q, i32 2 + %q3 = insertelement <16 x i16> %q2, i16 %q, i32 3 + %q4 = insertelement <16 x i16> %q3, i16 %q, i32 4 + %q5 = insertelement <16 x i16> %q4, i16 %q, i32 5 + %q6 = insertelement <16 x i16> %q5, i16 %q, i32 6 + %q7 = insertelement <16 x i16> %q6, i16 %q, i32 7 + %q8 = insertelement <16 x i16> %q7, i16 %q, i32 8 + %q9 = insertelement <16 x i16> %q8, i16 %q, i32 9 + %qa = insertelement <16 x i16> %q9, i16 %q, i32 10 + %qb = insertelement <16 x i16> %qa, i16 %q, i32 11 + %qc = insertelement <16 x i16> %qb, i16 %q, i32 12 + %qd = insertelement <16 x i16> %qc, i16 %q, i32 13 + %qe = insertelement <16 x i16> %qd, i16 %q, i32 14 + %qf = insertelement <16 x i16> %qe, i16 %q, i32 15 + ret <16 x i16> %qf +} +; CHECK: vpbroadcastd (% +define <4 x i32> @D32(i32* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i32* %ptr, align 4 + %q0 = insertelement <4 x i32> undef, i32 %q, i32 0 + %q1 = insertelement <4 x i32> %q0, i32 %q, i32 1 + %q2 = insertelement <4 x i32> %q1, i32 %q, i32 2 + %q3 = insertelement <4 x i32> %q2, i32 %q, i32 3 + ret <4 x i32> %q3 +} +; CHECK: vpbroadcastd (% +define <8 x i32> @DD32(i32* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i32* %ptr, align 4 + %q0 = insertelement <8 x i32> undef, i32 %q, i32 0 + %q1 = insertelement <8 x i32> %q0, i32 %q, i32 1 + %q2 = insertelement <8 x i32> %q1, i32 %q, i32 2 + %q3 = insertelement <8 x i32> %q2, i32 %q, i32 3 + %q4 = insertelement <8 x i32> %q3, i32 %q, i32 4 + %q5 = insertelement <8 x i32> %q4, i32 %q, i32 5 + %q6 = insertelement <8 x i32> %q5, i32 %q, i32 6 + %q7 = insertelement <8 x i32> %q6, i32 %q, i32 7 + ret <8 x i32> %q7 +} +; CHECK: vpbroadcastq (% +define <2 x i64> @Q64(i64* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i64* %ptr, align 4 + %q0 = insertelement <2 x i64> undef, i64 %q, i32 0 + %q1 = insertelement <2 x i64> %q0, i64 %q, i32 1 + ret <2 x i64> %q1 +} +; CHECK: vpbroadcastq (% +define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i64* %ptr, align 4 + %q0 = insertelement <4 x i64> undef, i64 %q, i32 0 + %q1 = insertelement <4 x i64> %q0, i64 %q, i32 1 + %q2 = insertelement <4 x i64> %q1, i64 %q, i32 2 + %q3 = insertelement <4 x i64> %q2, i64 %q, i32 3 + ret <4 x i64> %q3 +}