From 7091b2451d46ae7e85188d1c5d7a43d775346ee0 Mon Sep 17 00:00:00 2001
From: Michael Liao
Date: Tue, 14 Aug 2012 21:24:47 +0000
Subject: [PATCH] fix PR11334

- FP_EXTEND only supports extending from vectors with matching elements.
  This results in the scalarization of the extension from v2f32 to v2f64,
  since v2f32 is legalized to v4f32, which does not match v2f64.
- add X86-specific VFPEXT supporting extending from v4f32 to v2f64.
- add BUILD_VECTOR lowering helper to recover the original extension from
  v4f32 to v2f64.
- test case is enhanced to include different vector widths.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@161894 91177308-0d34-0410-b5e6-96231b3b80d8
---
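[Note: a minimal IR reproducer of the codegen problem addressed here,
distilled from PR11334; the function name is illustrative, and fuller
coverage is in the new test added below. Before this patch the fpext is
scalarized; with it, this should select a single cvtps2pd.]

define <2 x double> @repro(<2 x float> %v) nounwind {
entry:
  %ext = fpext <2 x float> %v to <2 x double>
  ret <2 x double> %ext
}
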
 lib/Target/X86/X86ISelLowering.cpp      | 81 +++++++++++++++++++++++++
 lib/Target/X86/X86ISelLowering.h        |  5 ++
 lib/Target/X86/X86InstrFragmentsSIMD.td |  5 ++
 lib/Target/X86/X86InstrSSE.td           |  8 +++
 test/CodeGen/X86/pr11334.ll             | 56 +++++++++++++++++
 5 files changed, 155 insertions(+)
 create mode 100644 test/CodeGen/X86/pr11334.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ea66a6115d7..648637229f6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5114,6 +5114,82 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
+// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to
+// v2f64 and convert it into X86ISD::VFPEXT, since the current ISD::FP_EXTEND
+// requires input and output vectors to have matching element counts.
+SDValue
+X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  SDNode *N = Op.getNode();
+  EVT VT = Op.getValueType();
+  unsigned NumElts = Op.getNumOperands();
+
+  // Check supported types and sub-targets.
+  //
+  // Only v2f32 -> v2f64 needs special handling.
+  if (VT != MVT::v2f64 || !Subtarget->hasSSE2())
+    return SDValue();
+
+  SDValue VecIn;
+  EVT VecInVT;
+  SmallVector<int, 8> Mask;
+  EVT SrcVT = MVT::Other;
+
+  // Check whether the pattern can be translated into X86vfpext.
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue In = N->getOperand(i);
+    unsigned Opcode = In.getOpcode();
+
+    // Skip if the element is undefined.
+    if (Opcode == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    // Quit if one of the elements is not defined from 'fpext'.
+    if (Opcode != ISD::FP_EXTEND)
+      return SDValue();
+
+    // Check how the source of 'fpext' is defined.
+    SDValue L2In = In.getOperand(0);
+    EVT L2InVT = L2In.getValueType();
+
+    // Check the original type.
+    if (SrcVT == MVT::Other)
+      SrcVT = L2InVT;
+    else if (SrcVT != L2InVT) // Quit if the source types are not homogeneous.
+      return SDValue();
+
+    // Check whether the value being 'fpext'ed is extracted from the same
+    // source.
+    Opcode = L2In.getOpcode();
+
+    // Quit if it's not extracted with a constant index.
+    if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(L2In.getOperand(1)))
+      return SDValue();
+
+    SDValue ExtractedFromVec = L2In.getOperand(0);
+
+    if (VecIn.getNode() == 0) {
+      VecIn = ExtractedFromVec;
+      VecInVT = ExtractedFromVec.getValueType();
+    } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec.
+      return SDValue();
+
+    Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue());
+  }
+
+  // Fill the remaining mask as undef.
+  for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i)
+    Mask.push_back(-1);
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getVectorShuffle(VecInVT, DL,
+                                          VecIn, DAG.getUNDEF(VecInVT),
+                                          &Mask[0]));
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
@@ -5146,6 +5222,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (Broadcast.getNode())
     return Broadcast;
 
+  SDValue FpExt = LowerVectorFpExtend(Op, DAG);
+  if (FpExt.getNode())
+    return FpExt;
+
   unsigned EVTBits = ExtVT.getSizeInBits();
 
   unsigned NumZero  = 0;
@@ -11343,6 +11423,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
   case X86ISD::VSHL:               return "X86ISD::VSHL";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 9123ebd8ae4..c8a04c02d07 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -227,6 +227,9 @@ namespace llvm {
       // VSEXT_MOVL - Vector move low and sign extend.
       VSEXT_MOVL,
 
+      // VFPEXT - Vector FP extend.
+      VFPEXT,
+
      // VSHL, VSRL - 128-bit vector logical left / right shift
       VSHLDQ, VSRLDQ,
 
@@ -828,6 +831,8 @@ namespace llvm {
     SDValue LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const;
     SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
 
+    SDValue LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const;
+
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index d13167bb05d..1db68c86b76 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -81,6 +81,11 @@ def X86vsmovl  : SDNode<"X86ISD::VSEXT_MOVL",
 
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86vfpext  : SDNode<"X86ISD::VFPEXT",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisFP<0>, SDTCisFP<1>]>>;
+
 def X86vshldq  : SDNode<"X86ISD::VSHLDQ",    SDTIntShiftOp>;
 def X86vshrdq  : SDNode<"X86ISD::VSRLDQ",    SDTIntShiftOp>;
 def X86cmpp    : SDNode<"X86ISD::CMPP",      SDTX86VFCMP>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index e4c35b9bc55..20dc81eb4a3 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2101,12 +2101,20 @@ let Predicates = [HasAVX] in {
   def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
             (VCVTPD2PSYrm addr:$src)>;
 
+  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+            (VCVTPS2PDrr VR128:$src)>;
   def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
             (VCVTPS2PDYrr VR128:$src)>;
   def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
             (VCVTPS2PDYrm addr:$src)>;
 }
 
+let Predicates = [HasSSE2] in {
+  // Match fextend for 128-bit conversions.
+  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+            (CVTPS2PDrr VR128:$src)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Compare Instructions
 //===----------------------------------------------------------------------===//
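[Note: to make the matched DAG pattern concrete, the scalarized form that
LowerVectorFpExtend recognizes corresponds roughly to the hand-scalarized
IR below; this is an illustrative sketch, not part of the patch, and the
names are made up. Each result lane must be an 'fpext' of an element
extracted with a constant index from one common source vector; the helper
rebuilds this into a shuffle of that source feeding a single
X86ISD::VFPEXT.]

define <2 x double> @scalarized_form(<4 x float> %src) nounwind {
entry:
  %e0 = extractelement <4 x float> %src, i32 0
  %e1 = extractelement <4 x float> %src, i32 1
  %x0 = fpext float %e0 to double
  %x1 = fpext float %e1 to double
  %r0 = insertelement <2 x double> undef, double %x0, i32 0
  %r1 = insertelement <2 x double> %r0, double %x1, i32 1
  ret <2 x double> %r1
}
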
diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
new file mode 100644
index 00000000000..5b7b5eab87e
--- /dev/null
+++ b/test/CodeGen/X86/pr11334.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
+
+define <2 x double> @v2f2d_ext_vec(<2 x float> %v1) nounwind {
+entry:
+; CHECK: v2f2d_ext_vec
+; CHECK: cvtps2pd
+; AVX:   v2f2d_ext_vec
+; AVX:   vcvtps2pd
+  %f1 = fpext <2 x float> %v1 to <2 x double>
+  ret <2 x double> %f1
+}
+
+define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind {
+entry:
+; CHECK: v3f2d_ext_vec
+; CHECK: cvtps2pd
+; CHECK: movhlps
+; CHECK: cvtps2pd
+; AVX:   v3f2d_ext_vec
+; AVX:   vcvtps2pd
+; AVX:   ret
+  %f1 = fpext <3 x float> %v1 to <3 x double>
+  ret <3 x double> %f1
+}
+
+define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind {
+entry:
+; CHECK: v4f2d_ext_vec
+; CHECK: cvtps2pd
+; CHECK: movhlps
+; CHECK: cvtps2pd
+; AVX:   v4f2d_ext_vec
+; AVX:   vcvtps2pd
+; AVX:   ret
+  %f1 = fpext <4 x float> %v1 to <4 x double>
+  ret <4 x double> %f1
+}
+
+define <8 x double> @v8f2d_ext_vec(<8 x float> %v1) nounwind {
+entry:
+; CHECK: v8f2d_ext_vec
+; CHECK: cvtps2pd
+; CHECK: cvtps2pd
+; CHECK: movhlps
+; CHECK: cvtps2pd
+; CHECK: movhlps
+; CHECK: cvtps2pd
+; AVX:   v8f2d_ext_vec
+; AVX:   vcvtps2pd
+; AVX:   vextractf128
+; AVX:   vcvtps2pd
+; AVX:   ret
+  %f1 = fpext <8 x float> %v1 to <8 x double>
+  ret <8 x double> %f1
+}
+