diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 0bc10a1f603..fbcc34ee655 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -639,6 +639,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
   setTargetDAGCombine(ISD::BRCOND);
   setTargetDAGCombine(ISD::BSWAP);
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
 
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
@@ -8301,6 +8303,105 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                                  N->getOperand(0), ShiftCst),
                      ShiftCst);
 }
 
+// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
+// builtins) into loads with swaps.
+SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX load");
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    Chain = LD->getChain();
+    Base = LD->getBasePtr();
+    MMO = LD->getMemOperand();
+    // If the MMO suggests this isn't a load of a full vector, leave
+    // things alone.  For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    Base = Intrin->getBasePtr();
+    MMO = Intrin->getMemOperand();
+    break;
+  }
+  }
+
+  MVT VecTy = N->getValueType(0).getSimpleVT();
+  SDValue LoadOps[] = { Chain, Base };
+  SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
+                                         DAG.getVTList(VecTy, MVT::Other),
+                                         LoadOps, VecTy, MMO);
+  DCI.AddToWorklist(Load.getNode());
+  Chain = Load.getValue(1);
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(VecTy, MVT::Other), Chain, Load);
+  DCI.AddToWorklist(Swap.getNode());
+  return Swap;
+}
+
+// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
+// builtins) into stores with swaps.
+SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  unsigned SrcOpnd;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX store");
+  case ISD::STORE: {
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    Chain = ST->getChain();
+    Base = ST->getBasePtr();
+    MMO = ST->getMemOperand();
+    SrcOpnd = 1;
+    // If the MMO suggests this isn't a store of a full vector, leave
+    // things alone.  For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    // Intrin->getBasePtr() oddly does not get what we want.
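+    // (For these store intrinsics, operand 0 is the chain, operand 1 the
+    // intrinsic ID, and operand 2 the vector being stored, so the pointer
+    // is operand 3.)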
+    Base = Intrin->getOperand(3);
+    MMO = Intrin->getMemOperand();
+    SrcOpnd = 2;
+    break;
+  }
+  }
+
+  SDValue Src = N->getOperand(SrcOpnd);
+  MVT VecTy = Src.getValueType().getSimpleVT();
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(VecTy, MVT::Other), Chain, Src);
+  DCI.AddToWorklist(Swap.getNode());
+  Chain = Swap.getValue(1);
+  SDValue StoreOps[] = { Chain, Swap, Base };
+  SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
+                                          DAG.getVTList(MVT::Other),
+                                          StoreOps, VecTy, MMO);
+  DCI.AddToWorklist(Store.getNode());
+  return Store;
+}
+
 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   const TargetMachine &TM = getTargetMachine();
@@ -8366,7 +8467,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
     break;
-  case ISD::STORE:
+  case ISD::STORE: {
     // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
     if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
         !cast<StoreSDNode>(N)->isTruncatingStore() &&
@@ -8417,10 +8518,33 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                     Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                                     cast<StoreSDNode>(N)->getMemOperand());
     }
+
+    // For little endian, VSX stores require generating xxswapd/stxvd2x.
+    EVT VT = N->getOperand(1).getValueType();
+    if (VT.isSimple()) {
+      MVT StoreVT = VT.getSimpleVT();
+      if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+          TM.getSubtarget<PPCSubtarget>().isLittleEndian() &&
+          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
+           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
+        return expandVSXStoreForLE(N, DCI);
+    }
     break;
+  }
   case ISD::LOAD: {
     LoadSDNode *LD = cast<LoadSDNode>(N);
     EVT VT = LD->getValueType(0);
+
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (VT.isSimple()) {
+      MVT LoadVT = VT.getSimpleVT();
+      if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+          TM.getSubtarget<PPCSubtarget>().isLittleEndian() &&
+          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
+           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
+        return expandVSXLoadForLE(N, DCI);
+    }
+
     Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
     unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
     if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
@@ -8569,6 +8693,34 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     }
     break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+        TM.getSubtarget<PPCSubtarget>().isLittleEndian()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_lxvw4x:
+      case Intrinsic::ppc_vsx_lxvd2x:
+        return expandVSXLoadForLE(N, DCI);
+      }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    // For little endian, VSX stores require generating xxswapd/stxvd2x.
+    if (TM.getSubtarget<PPCSubtarget>().hasVSX() &&
+        TM.getSubtarget<PPCSubtarget>().isLittleEndian()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_stxvw4x:
+      case Intrinsic::ppc_vsx_stxvd2x:
+        return expandVSXStoreForLE(N, DCI);
+      }
+    }
+    break;
+  }
   case ISD::BSWAP:
     // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
     if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index e9a71073f91..9842f45613d 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -254,6 +254,13 @@ namespace llvm {
       /// operand identifies the operating system entry point.
       SC,
 
+      /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
+      /// endian.  Maps to an xxswapd instruction that corrects an lxvd2x
+      /// or stxvd2x instruction.
+      /// The chain is necessary because the sequence replaces a load
+      /// and needs to provide the same number of outputs.
+      XXSWAPD,
+
       /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
       /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
       /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
@@ -293,7 +300,17 @@ namespace llvm {
       /// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces
       /// an ADDI8 instruction that adds G8RReg to sym\@toc\@l.
       /// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
-      ADDI_TOC_L
+      ADDI_TOC_L,
+
+      /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
+      /// Maps directly to an lxvd2x instruction that will be followed by
+      /// an xxswapd.
+      LXVD2X,
+
+      /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
+      /// Maps directly to an stxvd2x instruction that will be preceded by
+      /// an xxswapd.
+      STXVD2X
     };
   }
 
@@ -403,6 +420,9 @@ namespace llvm {
     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                             SelectionDAG &DAG) const override;
 
+    SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
     unsigned getRegisterByName(const char* RegName, EVT VT) const override;
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 6aff437a11d..d8003c9d8c8 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -25,6 +25,23 @@ def vsfrc : RegisterOperand<VSFRC> {
   let ParserMatchClass = PPCRegVSFRCAsmOperand;
 }
 
+// Little-endian-specific nodes.
+def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
+  SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [
+  SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
+  SDTCisSameAs<0, 1>
+]>;
+
+def PPClxvd2x  : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
+                        [SDNPHasChain, SDNPMayLoad]>;
+def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
+                        [SDNPHasChain, SDNPMayStore]>;
+def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
+
 multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
                        string asmbase, string asmstr, InstrItinClass itin,
                        list<dag> pattern> {
@@ -40,6 +57,9 @@ multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
 }
 
 def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
+def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">;
+def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">;
+
 let Predicates = [HasVSX] in {
 let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
 let hasSideEffects = 0 in { // VSX instructions don't have side effects.
@@ -854,11 +874,19 @@ def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
 def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 
 // Stores.
 def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
 def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
 def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+
+// Permutes.
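+// (xxswapd is xxpermdi with selector 2, i.e. {1,0}: the two doublewords
+// of the source register exchanged.)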
+def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
 
 // Selects.
 def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
index 9eed623baca..4edd8d59e52 100644
--- a/test/CodeGen/PowerPC/ppc64le-aggregates.ll
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -1,4 +1,8 @@
-; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec -mattr=-vsx | FileCheck %s
+
+; Currently VSX support is disabled for this test because we generate lxsdx
+; instead of lfd, and stxsdx instead of stfd.  That is a poor choice when we
+; have reg+imm addressing, and is on the list of things to be fixed.
 
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/ppcf128-endian.ll b/test/CodeGen/PowerPC/ppcf128-endian.ll
index 2a5f13a5c3d..180fedf5c9f 100644
--- a/test/CodeGen/PowerPC/ppcf128-endian.ll
+++ b/test/CodeGen/PowerPC/ppcf128-endian.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=pwr7 -mattr=+altivec < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mattr=+altivec -mattr=-vsx < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/test/CodeGen/PowerPC/vec_misaligned.ll b/test/CodeGen/PowerPC/vec_misaligned.ll
index 73a4a4d395d..49f11e4e260 100644
--- a/test/CodeGen/PowerPC/vec_misaligned.ll
+++ b/test/CodeGen/PowerPC/vec_misaligned.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mattr=-power8-vector | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mattr=-power8-vector | FileCheck %s -check-prefix=CHECK-LE
 
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/vec_shuffle_le.ll b/test/CodeGen/PowerPC/vec_shuffle_le.ll
index a4b2119f6eb..c7fc1c60c5e 100644
--- a/test/CodeGen/PowerPC/vec_shuffle_le.ll
+++ b/test/CodeGen/PowerPC/vec_shuffle_le.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mcpu=pwr7 | FileCheck %s
 
 define void @VPKUHUM_xy(<16 x i8>* %A, <16 x i8>* %B) {
 entry:
diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll
index cf593b779b7..b9d23d9c270 100644
--- a/test/CodeGen/PowerPC/vsx-ldst.ll
+++ b/test/CodeGen/PowerPC/vsx-ldst.ll
@@ -9,6 +9,14 @@
 ; RUN: grep stxvw4x < %t | count 3
 ; RUN: grep stxvd2x < %t | count 3
+
+;; Note: The LE test variant is disabled until LE support for VSX is enabled,
+;; as otherwise we fail to get the expected counts.
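+;; (Each of the six loads and six stores should then get its own swap,
+;; which is where the xxpermdi count of 12 below comes from.)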
+
+; R;UN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
+; R;UN: grep lxvd2x < %t | count 6
+; R;UN: grep stxvd2x < %t | count 6
+; R;UN: grep xxpermdi < %t | count 12
 
 @vsi = global <4 x i32> , align 16
 @vui = global <4 x i32> , align 16
 @vf = global <4 x float> , align 16
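
A minimal sketch of the kind of IR this combine targets, for reference; it is
not part of the patch, and the function name is made up. Compiled with
something like

  llc -mcpu=pwr8 -mattr=+vsx -mtriple=powerpc64le-unknown-linux-gnu

the aligned vector load below should now select to an lxvd2x followed by an
xxswapd (xxpermdi $v, $v, 2), and the store to an xxswapd followed by an
stxvd2x; per the combine, the same expansion applies to v2i64, v4f32, and
v4i32.

define void @copy_v2f64(<2 x double>* %src, <2 x double>* %dst) {
entry:
  ; Rewritten to PPCISD::LXVD2X + PPCISD::XXSWAPD by expandVSXLoadForLE.
  %v = load <2 x double>* %src, align 16
  ; Rewritten to PPCISD::XXSWAPD + PPCISD::STXVD2X by expandVSXStoreForLE.
  store <2 x double> %v, <2 x double>* %dst, align 16
  ret void
}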