diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 592e5b37ac8..00b4976b505 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5406,7 +5406,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op.getValueType().getSimpleVT();
   SDLoc dl(Op);
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
          "Unsupported vector type for broadcast.");
 
   SDValue Ld;
@@ -5462,13 +5462,18 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
       // The scalar_to_vector node and the suspected
       // load node must have exactly one user.
       // Constants may have multiple users.
-      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+
+      // AVX-512 has a register version of the broadcast.
+      bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
+                       Ld.getValueType().getSizeInBits() >= 32;
+      if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
+                             !hasRegVer))
         return SDValue();
       break;
     }
   }
 
-  bool Is256 = VT.is256BitVector();
+  bool IsGE256 = (VT.getSizeInBits() >= 256);
 
   // Handle the broadcasting a single constant scalar from the constant pool
   // into a vector. On Sandybridge it is still better to load a constant vector
@@ -5478,7 +5483,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
     assert(!CVT.isVector() && "Must not broadcast a vector type");
     unsigned ScalarSize = CVT.getSizeInBits();
 
-    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
+    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
       const Constant *C = 0;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
@@ -5502,14 +5507,14 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
 
   // Handle AVX2 in-register broadcasts.
   if (!IsLoad && Subtarget->hasInt256() &&
-      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
+      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
 
   // The scalar source must be a normal load.
   if (!IsLoad)
     return SDValue();
 
-  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
+  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
 
   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
@@ -13230,6 +13235,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 487d82947d7..270327495d0 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -321,6 +321,8 @@ namespace llvm {
       VPERMI,
       VPERM2X128,
       VBROADCAST,
+      // masked broadcast
+      VBROADCASTM,
 
       // PMULUDQ - Vector multiply packed unsigned doubleword integers
       PMULUDQ,
@@ -852,7 +854,9 @@ namespace llvm {
     SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 18ccdc35ae5..8abae14f946 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -347,6 +347,132 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
       [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
                       addr:$dst)]>, EVEX;
 
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr,
+                               RegisterClass DestRC,
+                               RegisterClass SrcRC, X86MemOperand x86memop> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    []>, EVEX;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    []>, EVEX;
+}
+let ExeDomain = SSEPackedSingle in {
+  defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512,
+                                           VR128X, f32mem>,
+                       EVEX_V512, EVEX_CD8<32, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512,
+                                           VR128X, f64mem>,
+                       EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VBROADCASTSDZrm addr:$src)>;
+
+multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
+                                    RegisterClass SrcRC, RegisterClass KRC> {
+  def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     []>, EVEX, EVEX_V512;
+  def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst),
+                      (ins KRC:$mask, SrcRC:$src),
+                      !strconcat(OpcodeStr,
+                        "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+                      []>, EVEX, EVEX_V512, EVEX_KZ;
+}
+
+defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>,
+                     VEX_W;
+
+def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
+           (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+
+def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
+           (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+
+def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
+          (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
+          (VPBROADCASTQrZrr GR64:$src)>;
+
+multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
+                                   X86MemOperand x86memop, PatFrag ld_frag,
+                                   RegisterClass DstRC, ValueType OpVT,
+                                   ValueType SrcVT, RegisterClass KRC> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set DstRC:$dst,
+                       (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
+  def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+                     (ins KRC:$mask, VR128X:$src),
+                     !strconcat(OpcodeStr,
+                       "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+                     [(set DstRC:$dst,
+                        (OpVT (X86VBroadcastm KRC:$mask,
+                                              (SrcVT VR128X:$src))))]>,
+                     EVEX, EVEX_KZ;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set DstRC:$dst,
+                       (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
+  def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+                     (ins KRC:$mask, x86memop:$src),
+                     !strconcat(OpcodeStr,
+                       "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+                     [(set DstRC:$dst,
+                        (OpVT (X86VBroadcastm KRC:$mask,
+                                              (ld_frag addr:$src))))]>,
+                     EVEX, EVEX_KZ;
+}
+
+defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
+                                             loadi32, VR512, v16i32, v4i32,
+                                             VK16WM>,
+                     EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
+                                             loadi64, VR512, v8i64, v2i64,
+                                             VK8WM>,
+                     EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
+          (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
+          (VBROADCASTSDZrr VR128X:$src)>;
+
+// Provide a fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
+          (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
+          (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
+          (EXTRACT_SUBREG
+            (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+                                      addr:$src)), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+                                 RegisterClass DstRC, RegisterClass KRC,
+                                 ValueType OpVT, ValueType SrcVT> {
+def rr : AVX512XS8I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    []>, EVEX;
+}
+
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
+                                             VK16, v16i32, v16i1>, EVEX_V512;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
+                                             VK8, v8i64, v8i1>, EVEX_V512,
+                                             VEX_W;
+
 // Mask register copy, including
 //   - copy between mask registers
 //   - load/store mask registers
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index db53af00b56..0b51521feea 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -157,7 +157,9 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
 def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                 SDTCisSameAs<0,2>, SDTCisInt<3>]>;
 
-def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcast  : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
+
 def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                              SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
 
@@ -196,6 +198,7 @@ def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
 
 def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
 def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
new file mode 100644
index 00000000000..4f07f942f2e
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+;CHECK: _inreg16xi32
+;CHECK: vpbroadcastd {{.*}}, %zmm
+;CHECK: ret
+define <16 x i32> @_inreg16xi32(i32 %a) {
+  %b = insertelement <16 x i32> undef, i32 %a, i32 0
+  %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
+  ret <16 x i32> %c
+}
+
+;CHECK: _inreg8xi64
+;CHECK: vpbroadcastq {{.*}}, %zmm
+;CHECK: ret
+define <8 x i64> @_inreg8xi64(i64 %a) {
+  %b = insertelement <8 x i64> undef, i64 %a, i32 0
+  %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+  ret <8 x i64> %c
+}
+
+;CHECK: _inreg16xfloat
+;CHECK: vbroadcastssz {{.*}}, %zmm
+;CHECK: ret
+define <16 x float> @_inreg16xfloat(float %a) {
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %c
+}
+
+;CHECK: _inreg8xdouble
+;CHECK: vbroadcastsdz {{.*}}, %zmm
+;CHECK: ret
+define <8 x double> @_inreg8xdouble(double %a) {
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  ret <8 x double> %c
+}