diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ff56fd92d6b..4de5e2585ec 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -956,6 +956,7 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
   }
 
   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
@@ -1293,6 +1294,10 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
     setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+    // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
+    // when we have a 256-bit-wide blend with immediate.
+    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
   } else {
     setOperationAction(ISD::ADD, MVT::v4i64, Custom);
     setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@@ -13307,19 +13312,130 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
   return Sub;
 }
 
+static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  // The algorithm is the following:
+  // #ifdef __SSE4_1__
+  //   uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+  //   uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+  //                               (uint4) 0x53000000, 0xaa);
+  // #else
+  //   uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+  //   uint4 hi = (v >> 16) | (uint4) 0x53000000;
+  // #endif
+  //   float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+  //   return (float4) lo + fhi;
+
+  SDLoc DL(Op);
+  SDValue V = Op->getOperand(0);
+  EVT VecIntVT = V.getValueType();
+  bool Is128 = VecIntVT == MVT::v4i32;
+  EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+  unsigned NumElts = VecIntVT.getVectorNumElements();
+  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+         "Unsupported custom type");
+  assert(NumElts <= 8 && "The size of the constant array must be fixed");
+
+  // In the #ifdef/#else code, we have in common:
+  // - The vector of constants:
+  //   -- 0x4b000000
+  //   -- 0x53000000
+  // - A shift:
+  //   -- v >> 16
+
+  // Create the splat vector for 0x4b000000.
+  SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
+  SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
+                           CstLow, CstLow, CstLow, CstLow};
+  SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                  makeArrayRef(&CstLowArray[0], NumElts));
+  // Create the splat vector for 0x53000000.
+  SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
+  SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
+                            CstHigh, CstHigh, CstHigh, CstHigh};
+  SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                   makeArrayRef(&CstHighArray[0], NumElts));
+
+  // Create the right shift.
+  SDValue CstShift = DAG.getConstant(16, MVT::i32);
+  SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
+                             CstShift, CstShift, CstShift, CstShift};
+  SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                    makeArrayRef(&CstShiftArray[0], NumElts));
+  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
+
+  SDValue Low, High;
+  if (Subtarget.hasSSE41()) {
+    EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
+    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+    SDValue VecCstLowBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
+    SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
+    // Low will be bitcasted right away, so do not bother bitcasting back to its
+    // original type.
+    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
+                      VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
+    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+    //                             (uint4) 0x53000000, 0xaa);
+    SDValue VecCstHighBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
+    SDValue VecShiftBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
+    // High will be bitcasted right away, so do not bother bitcasting back to
+    // its original type.
+    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
+                       VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
+  } else {
+    SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
+    SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
+                                     CstMask, CstMask, CstMask);
+    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
+    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
+
+    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
+    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
+  }
+
+  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
+  SDValue CstFAdd = DAG.getConstantFP(
+      APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
+  SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
+                            CstFAdd, CstFAdd, CstFAdd, CstFAdd};
+  SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
+                                   makeArrayRef(&CstFAddArray[0], NumElts));
+
+  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+  SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
+  SDValue FHigh =
+      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
+  // return (float4) lo + fhi;
+  SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
+  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
+}
+
 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDValue N0 = Op.getOperand(0);
   MVT SVT = N0.getSimpleValueType();
   SDLoc dl(Op);
 
-  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
-          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
-         "Custom UINT_TO_FP is not supported!");
-
-  MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
-  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
-                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+  switch (SVT.SimpleTy) {
+  default:
+    llvm_unreachable("Custom UINT_TO_FP is not supported!");
+  case MVT::v4i8:
+  case MVT::v4i16:
+  case MVT::v8i8:
+  case MVT::v8i16: {
+    MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+  }
+  case MVT::v4i32:
+  case MVT::v8i32:
+    return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
+  }
+  llvm_unreachable(nullptr);
 }
 
 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 531e0353d3c..dc08a3b0e14 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -578,7 +578,7 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
     // There are faster sequences for float conversions.
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
@@ -661,6 +661,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
     { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
     { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
+
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
   };
 
   static const TypeConversionCostTblEntry
diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll
index cecd069cd25..fb16af635f0 100644
--- a/test/Analysis/CostModel/X86/cast.ll
+++ b/test/Analysis/CostModel/X86/cast.ll
@@ -225,7 +225,9 @@ define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
   ; CHECK: cost of 5 {{.*}} uitofp
   %C1 = uitofp <8 x i16> %c to <8 x float>
 
-  ; CHECK: cost of 9 {{.*}} uitofp
+  ; CHECK-AVX2: cost of 8 {{.*}} uitofp
+  ; CHECK-AVX512: cost of 8 {{.*}} uitofp
+  ; CHECK-AVX: cost of 9 {{.*}} uitofp
   %D1 = uitofp <8 x i32> %d to <8 x float>
   ret void
 }
diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll
index a41a04df84e..27ec268b42a 100644
--- a/test/Analysis/CostModel/X86/uitofp.ll
+++ b/test/Analysis/CostModel/X86/uitofp.ll
@@ -235,7 +235,7 @@ define <2 x float> @uitofpv2i8v2float(<2 x i8> %a) {
 
 define <4 x float> @uitofpv4i8v4float(<4 x i8> %a) {
   ; SSE2: uitofpv4i8v4float
-  ; SSE2: cost of 15 {{.*}} uitofp
+  ; SSE2: cost of 8 {{.*}} uitofp
   %1 = uitofp <4 x i8> %a to <4 x float>
   ret <4 x float> %1
 }
@@ -270,7 +270,7 @@ define <2 x float> @uitofpv2i16v2float(<2 x i16> %a) {
 
 define <4 x float> @uitofpv4i16v4float(<4 x i16> %a) {
   ; SSE2: uitofpv4i16v4float
-  ; SSE2: cost of 15 {{.*}} uitofp
+  ; SSE2: cost of 8 {{.*}} uitofp
   %1 = uitofp <4 x i16> %a to <4 x float>
   ret <4 x float> %1
 }
@@ -305,28 +305,28 @@ define <2 x float> @uitofpv2i32v2float(<2 x i32> %a) {
 
 define <4 x float> @uitofpv4i32v4float(<4 x i32> %a) {
   ; SSE2: uitofpv4i32v4float
-  ; SSE2: cost of 15 {{.*}} uitofp
+  ; SSE2: cost of 8 {{.*}} uitofp
   %1 = uitofp <4 x i32> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @uitofpv8i32v8float(<8 x i32> %a) {
   ; SSE2: uitofpv8i32v8float
-  ; SSE2: cost of 30 {{.*}} uitofp
+  ; SSE2: cost of 16 {{.*}} uitofp
   %1 = uitofp <8 x i32> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @uitofpv16i32v16float(<16 x i32> %a) {
   ; SSE2: uitofpv16i32v16float
-  ; SSE2: cost of 60 {{.*}} uitofp
+  ; SSE2: cost of 32 {{.*}} uitofp
   %1 = uitofp <16 x i32> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @uitofpv32i32v32float(<32 x i32> %a) {
   ; SSE2: uitofpv32i32v32float
-  ; SSE2: cost of 120 {{.*}} uitofp
+  ; SSE2: cost of 64 {{.*}} uitofp
   %1 = uitofp <32 x i32> %a to <32 x float>
   ret <32 x float> %1
 }
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 764c2cdd6d9..e046b966921 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "7 machine-licm"
 ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll
index ee20f1fcbd0..607be9a8549 100644
--- a/test/CodeGen/X86/vec_uint_to_fp.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp.ll
@@ -1,11 +1,156 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck --check-prefix=CHECK --check-prefix=SSE41 --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck --check-prefix=CHECK --check-prefix=AVX2 %s
+
+; Check that the constants used in the vectors are the right ones.
+; SSE: [[MASKCSTADDR:LCPI0_[0-9]+]]:
+; SSE-NEXT: .long 65535 ## 0xffff
+; SSE-NEXT: .long 65535 ## 0xffff
+; SSE-NEXT: .long 65535 ## 0xffff
+; SSE-NEXT: .long 65535 ## 0xffff
+
+; CST: [[LOWCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long 1258291200 ## 0x4b000000
+; CST-NEXT: .long 1258291200 ## 0x4b000000
+; CST-NEXT: .long 1258291200 ## 0x4b000000
+; CST-NEXT: .long 1258291200 ## 0x4b000000
+
+; CST: [[HIGHCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long 1392508928 ## 0x53000000
+; CST-NEXT: .long 1392508928 ## 0x53000000
+; CST-NEXT: .long 1392508928 ## 0x53000000
+; CST-NEXT: .long 1392508928 ## 0x53000000
+
+; CST: [[MAGICCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long 3539992704 ## float -5.497642e+11
+; CST-NEXT: .long 3539992704 ## float -5.497642e+11
+; CST-NEXT: .long 3539992704 ## float -5.497642e+11
+; CST-NEXT: .long 3539992704 ## float -5.497642e+11
+
+; AVX2: [[LOWCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long 1258291200 ## 0x4b000000
+
+; AVX2: [[HIGHCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long 1392508928 ## 0x53000000
+
+; AVX2: [[MAGICCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long 3539992704 ## float -5.49764202E+11
 
-; Test that we are not lowering uinttofp to scalars
 define <4 x float> @test1(<4 x i32> %A) nounwind {
 ; CHECK-LABEL: test1:
-; CHECK-NOT: cvtsd2ss
-; CHECK: ret
+;
+; SSE: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]]
+; SSE-NEXT: pand %xmm0, [[MASK]]
+; After this instruction, MASK will have the value of the low parts
+; of the vector.
+; SSE-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0
+; SSE-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0
+; SSE-NEXT: addps [[MASK]], %xmm0
+; SSE-NEXT: retq
+;
+; Currently we commute the arguments of the first blend, but this could be
+; improved to match the lowering of the second blend.
+; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]]
+; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0
+; SSE41-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0
+; SSE41-NEXT: addps [[LOWVEC]], %xmm0
+; SSE41-NEXT: retq
+;
+; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]]
+; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
+; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
+; AVX-NEXT: vaddps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
+; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
+; AVX-NEXT: retq
+;
+; The lowering for AVX2 is a bit messy because we select broadcast
+; instructions instead of folding the constant loads.
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]]
+; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
+; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
+; AVX2-NEXT: retq
   %C = uitofp <4 x i32> %A to <4 x float>
   ret <4 x float> %C
 }
+
+; Match the AVX2 constants used in the next function
+; AVX2: [[LOWCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long 1258291200 ## 0x4b000000
+
+; AVX2: [[HIGHCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long 1392508928 ## 0x53000000
+
+; AVX2: [[MAGICCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long 3539992704 ## float -5.49764202E+11
+
+define <8 x float> @test2(<8 x i32> %A) nounwind {
+; CHECK-LABEL: test2:
+; Legalization will break this into 2 x <4 x i32> on anything prior to AVX.
+; The constants used in the vector instructions are shared between the
+; two sequences of instructions.
+;
+; SSE: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
+; SSE-NEXT: pand %[[MASK]], [[VECLOW]]
+; SSE-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
+; SSE-NEXT: por %[[LOWCST]], [[VECLOW]]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
+; SSE-NEXT: por %[[HIGHCST]], %xmm0
+; SSE-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE-NEXT: addps %[[MAGICCST]], %xmm0
+; SSE-NEXT: addps [[VECLOW]], %xmm0
+; MASK is the low vector of the second part after this point.
+; SSE-NEXT: pand %xmm1, %[[MASK]]
+; SSE-NEXT: por %[[LOWCST]], %[[MASK]]
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %[[HIGHCST]], %xmm1
+; SSE-NEXT: addps %[[MAGICCST]], %xmm1
+; SSE-NEXT: addps %[[MASK]], %xmm1
+; SSE-NEXT: retq
+;
+; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
+; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
+; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0
+; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE41-NEXT: addps %[[MAGICCST]], %xmm0
+; SSE41-NEXT: addps [[VECLOW]], %xmm0
+; LOWCST is the low vector of the second part after this point.
+; The operands of the blend are inverted because we reuse xmm1
+; in the next shift.
+; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]]
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1
+; SSE41-NEXT: addps %[[MAGICCST]], %xmm1
+; SSE41-NEXT: addps %[[LOWCST]], %xmm1
+; SSE41-NEXT: retq
+;
+; Test that we are not lowering uitofp to scalars
+; AVX-NOT: cvtsd2ss
+; AVX: retq
+;
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]]
+; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]]
+; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0
+; AVX2-NEXT: retq
+  %C = uitofp <8 x i32> %A to <8 x float>
+  ret <8 x float> %C
+}
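
For readers who want to try the trick outside the compiler, here is a minimal standalone sketch of the SSE4.1 path described in the algorithm comment of lowerUINT_TO_FP_vXi32, written with <smmintrin.h> intrinsics. It is an illustration only, not code from this patch: the function name is made up, and it subtracts the positive magic constant explicitly where the lowering instead folds the negated constant (bit pattern 0xD3000080) into a single FADD.

#include <smmintrin.h> // SSE4.1 intrinsics

// Convert four unsigned 32-bit integers to float without a native unsigned
// conversion. Each lane is split into its low and high 16-bit halves; each
// half is placed in the mantissa of a biased float (2^23 for the low half,
// 2^39 for the high half), and the biases are removed with FP arithmetic.
static __m128 uint32x4_to_float32x4(__m128i v) {
  const __m128i LowBias  = _mm_set1_epi32(0x4b000000); // bit pattern of 2^23
  const __m128i HighBias = _mm_set1_epi32(0x53000000); // bit pattern of 2^39
  // lo = (v & 0xffff) | 0x4b000000: blend mask 0xaa takes the odd 16-bit
  // words (the bias) from the constant and the even words from v.
  __m128i Lo = _mm_blend_epi16(v, LowBias, 0xaa);
  // hi = (v >> 16) | 0x53000000
  __m128i Hi = _mm_blend_epi16(_mm_srli_epi32(v, 16), HighBias, 0xaa);
  // fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); return (float4) lo + fhi
  const __m128 Magic = _mm_set1_ps(549764202496.0f); // 2^39 + 2^23, exact
  __m128 FHi = _mm_sub_ps(_mm_castsi128_ps(Hi), Magic);
  return _mm_add_ps(_mm_castsi128_ps(Lo), FHi);
}

Compiling this with -msse4.1 and comparing it lane by lane against a scalar (float)u conversion for values such as 0, 0x7fffffff, 0x80000000, and 0xffffffff is a quick way to convince yourself that the magic constants are right; the v8i32/AVX2 lowering applies the same reasoning per 32-bit lane across a 256-bit vector.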