From e2d3e4467ee57211d661b3474a6dd7e8a85c8376 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 16 Jun 2015 21:40:28 +0000
Subject: [PATCH] [X86][SSE] Vectorize v2i32 to v2f64 conversions

This patch enables v2i32 to v2f64 conversions to use the CVTDQ2PD xmm
instruction and stay on the SSE unit, instead of scalarizing,
sign-extending to i64 and using CVTSI2SDQ scalar conversions.

Differential Revision: http://reviews.llvm.org/D10433

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239855 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp      | 15 +++-
 lib/Target/X86/X86ISelLowering.h        |  3 +
 lib/Target/X86/X86InstrFragmentsSIMD.td |  3 +
 lib/Target/X86/X86InstrSSE.td           | 15 +++-
 test/CodeGen/X86/vec_int_to_fp.ll       | 71 +++----------------
 .../X86/x86-setcc-int-to-fp-combine.ll  |  8 +--
 6 files changed, 42 insertions(+), 73 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6b76fbc02ed..da3f9c11713 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -915,6 +915,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
 
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i8,  Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
     // As there is no 64-bit GPR available, we need build a special custom
@@ -11648,15 +11650,21 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
 
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
-  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+  SDValue Src = Op.getOperand(0);
+  MVT SrcVT = Src.getSimpleValueType();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
   if (SrcVT.isVector()) {
+    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+      return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
+                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+                                     DAG.getUNDEF(SrcVT)));
+    }
     if (SrcVT.getVectorElementType() == MVT::i1) {
       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
-                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
-                                     Op.getOperand(0)));
+                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
     }
     return SDValue();
   }
@@ -18498,6 +18506,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VINSERT:  return "X86ISD::VINSERT";
   case X86ISD::VFPEXT:   return "X86ISD::VFPEXT";
   case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
+  case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD";
   case X86ISD::VSHLDQ:   return "X86ISD::VSHLDQ";
   case X86ISD::VSRLDQ:   return "X86ISD::VSRLDQ";
   case X86ISD::VSHL:     return "X86ISD::VSHL";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index efda861e2eb..7f00c73c961 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -293,6 +293,9 @@ namespace llvm {
       // Vector FP round.
      VFPROUND,
 
+      // Vector signed integer to double.
+ CVTDQ2PD, + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index f158d1233d9..889fd39cf0e 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -72,6 +72,9 @@ def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; //def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; +def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", + SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 8294e38e995..95629184f2c 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2234,14 +2234,27 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>; -// AVX 256-bit register conversion intrinsics +// AVX register conversion intrinsics let Predicates = [HasAVX] in { + def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), + (VCVTDQ2PDrr VR128:$src)>; + def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), + (VCVTDQ2PDrm addr:$src)>; + def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PDYrr VR128:$src)>; def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), (VCVTDQ2PDYrm addr:$src)>; } // Predicates = [HasAVX] +// SSE2 register conversion intrinsics +let Predicates = [HasSSE2] in { + def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), + (CVTDQ2PDrr VR128:$src)>; + def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), + (CVTDQ2PDrm addr:$src)>; +} // Predicates = [HasSSE2] + // Convert packed double to packed single // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. 
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 6fb1943e888..ad6ac435533 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -34,30 +34,12 @@ define <2 x double> @sitofp_2vf64(<2 x i64> %a) {
 define <2 x double> @sitofp_2vf64_i32(<4 x i32> %a) {
 ; SSE2-LABEL: sitofp_2vf64_i32:
 ; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: movd %xmm0, %rcx
-; SSE2-NEXT: movslq %ecx, %rcx
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sdq %rcx, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_2vf64_i32:
 ; AVX: # BB#0:
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: cltq
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0
-; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
 ; AVX-NEXT: retq
   %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
   %cvt = sitofp <2 x i32> %shuf to <2 x double>
@@ -177,28 +159,10 @@ define <4 x double> @sitofp_4vf64(<4 x i64> %a) {
 define <4 x double> @sitofp_4vf64_i32(<4 x i32> %a) {
 ; SSE2-LABEL: sitofp_4vf64_i32:
 ; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movd %xmm1, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_4vf64_i32:
@@ -257,28 +221,9 @@ define <4 x double> @sitofp_4vf64_i8(<16 x i8> %a) {
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: cltq
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: cvtsi2sdq %rax, %xmm2
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_4vf64_i8:
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
index a2c5b3a6eed..248a9202e99 100644
--- a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -27,12 +27,8 @@ define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwin
 ; CHECK-NEXT: .long 1 ## 0x1
 ; CHECK-NEXT: .long 1 ## 0x1
 ; CHECK-LABEL: foo1:
-; FIXME: The operation gets scalarized. If/when the compiler learns to better
-; use [V]CVTDQ2PD, this will need updated.
-; CHECK: cvtsi2sdq
-; CHECK: cvtsi2sdq
-; CHECK: cvtsi2sdq
-; CHECK: cvtsi2sdq
+; CHECK: cvtdq2pd
+; CHECK: cvtdq2pd
  %cmp = fcmp oeq <4 x float> %val, %test
  %ext = zext <4 x i1> %cmp to <4 x i32>
  %result = sitofp <4 x i32> %ext to <4 x double>
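
For reference, a minimal IR reproducer for the new lowering, distilled from
the sitofp_2vf64_i32 test above (the function name and comments are
illustrative only, not part of the patch):

  ; SSE2 codegen is now a single 'cvtdq2pd %xmm0, %xmm0' (AVX:
  ; 'vcvtdq2pd %xmm0, %xmm0') instead of a chain of scalar cvtsi2sdq
  ; conversions.
  define <2 x double> @cvt_low_v4i32_to_v2f64(<4 x i32> %a) {
    ; keep the low two i32 lanes, then convert them to double
    %lo = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
    %cvt = sitofp <2 x i32> %lo to <2 x double>
    ret <2 x double> %cvt
  }

Since (V)CVTDQ2PD only reads the low 64 bits (two i32 elements) of its
source, the lowering widens the v2i32 operand to v4i32 with CONCAT_VECTORS
and an undef upper half before emitting the X86ISD::CVTDQ2PD node, rather
than sign-extending each element to i64.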