From 7a0282daeb214f14d75249cc2d90302c44586c4e Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Mon, 26 Aug 2013 15:05:44 +0000
Subject: [PATCH] R600: Add support for v4i32 and v2i32 local stores

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189222 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDGPUISelLowering.cpp | 152 ++++++++++++++++---------
 lib/Target/R600/AMDGPUISelLowering.h   |  12 +-
 lib/Target/R600/R600ISelLowering.cpp   |   2 +-
 test/CodeGen/R600/store.ll             |  52 +++++++++
 4 files changed, 159 insertions(+), 59 deletions(-)

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 56a9a2b5deb..9df835fc874 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -67,6 +67,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::f64, Promote);
   AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
 
+  // Custom lowering of vector stores is required for local address space
+  // stores.
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  // XXX: Native v2i32 local address space stores are possible, but not
+  // currently implemented.
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+
   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
@@ -221,7 +228,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
-  case ISD::STORE: return LowerVectorStore(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
   }
   return Op;
@@ -417,7 +424,98 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
   return Op;
 }
 
+SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+  EVT MemVT = Store->getMemoryVT();
+  unsigned MemBits = MemVT.getSizeInBits();
+  // Byte stores are really expensive, so if possible, try to pack
+  // 32-bit vector truncating store into an i32 store.
+  // XXX: We could also optimize other vector bitwidths
+  if (!MemVT.isVector() || MemBits > 32) {
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  const SDValue &Value = Store->getValue();
+  EVT VT = Value.getValueType();
+  const SDValue &Ptr = Store->getBasePtr();
+  EVT MemEltVT = MemVT.getVectorElementType();
+  unsigned MemEltBits = MemEltVT.getSizeInBits();
+  unsigned MemNumElements = MemVT.getVectorNumElements();
+  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+  SDValue Mask;
+  switch(MemEltBits) {
+  case 8:
+    Mask = DAG.getConstant(0xFF, PackedVT);
+    break;
+  case 16:
+    Mask = DAG.getConstant(0xFFFF, PackedVT);
+    break;
+  default:
+    llvm_unreachable("Cannot lower this vector store");
+  }
+  SDValue PackedValue;
+  for (unsigned i = 0; i < MemNumElements; ++i) {
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
+                              DAG.getConstant(i, MVT::i32));
+    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
+    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
+    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
+    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
+    if (i == 0) {
+      PackedValue = Elt;
+    } else {
+      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
+    }
+  }
+  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
+                      MachinePointerInfo(Store->getMemOperand()->getValue()),
+                      Store->isVolatile(), Store->isNonTemporal(),
+                      Store->getAlignment());
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
+  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
+  EVT PtrVT = Store->getBasePtr().getValueType();
+  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
+  SDLoc SL(Op);
+
+  SmallVector<SDValue, 8> Chains;
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                              Store->getValue(), DAG.getConstant(i, MVT::i32));
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
+                              Store->getBasePtr(),
+                              DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
+                                              PtrVT));
+    Chains.push_back(DAG.getStore(Store->getChain(), SL, Val, Ptr,
+                         MachinePointerInfo(Store->getMemOperand()->getValue()),
+                         Store->isVolatile(), Store->isNonTemporal(),
+                         Store->getAlignment()));
+  }
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
+}
+
+SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
+
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+      Store->getValue().getValueType().isVector()) {
+    return SplitVectorStore(Op, DAG);
+  }
+  return SDValue();
+}
 
 
 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
     SelectionDAG &DAG) const {
@@ -524,58 +622,6 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
-SDValue AMDGPUTargetLowering::LowerVectorStore(const SDValue &Op,
-                                               SelectionDAG &DAG) const {
-  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
-  EVT MemVT = Store->getMemoryVT();
-  unsigned MemBits = MemVT.getSizeInBits();
-
-  // Byte stores are really expensive, so if possible, try to pack
-  // 32-bit vector truncatating store into an i32 store.
-  // XXX: We could also handle optimize other vector bitwidths
-  if (!MemVT.isVector() || MemBits > 32) {
-    return SDValue();
-  }
-
-  SDLoc DL(Op);
-  const SDValue &Value = Store->getValue();
-  EVT VT = Value.getValueType();
-  const SDValue &Ptr = Store->getBasePtr();
-  EVT MemEltVT = MemVT.getVectorElementType();
-  unsigned MemEltBits = MemEltVT.getSizeInBits();
-  unsigned MemNumElements = MemVT.getVectorNumElements();
-  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
-  SDValue Mask;
-  switch(MemEltBits) {
-  case 8:
-    Mask = DAG.getConstant(0xFF, PackedVT);
-    break;
-  case 16:
-    Mask = DAG.getConstant(0xFFFF, PackedVT);
-    break;
-  default:
-    llvm_unreachable("Cannot lower this vector store");
-  }
-  SDValue PackedValue;
-  for (unsigned i = 0; i < MemNumElements; ++i) {
-    EVT ElemVT = VT.getVectorElementType();
-    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
-                              DAG.getConstant(i, MVT::i32));
-    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
-    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
-    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
-    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
-    if (i == 0) {
-      PackedValue = Elt;
-    } else {
-      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
-    }
-  }
-  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
-                      MachinePointerInfo(Store->getMemOperand()->getValue()),
-                      Store->isVolatile(), Store->isNonTemporal(),
-                      Store->getAlignment());
-}
 
 //===----------------------------------------------------------------------===//
 // Helper functions
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index e3a0dcca810..f739aed9783 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -31,6 +31,12 @@ private:
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  /// \brief Lower vector stores by merging the vector elements into an integer
+  /// of the same bitwidth.
+  SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
+  /// \brief Split a vector store into multiple scalar stores.
+  /// \returns The resulting chain.
+  SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
 
 protected:
@@ -44,17 +50,13 @@ protected:
                                unsigned Reg, EVT VT) const;
   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const;
-
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
   void AnalyzeFormalArguments(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
 
-  /// \brief Lower vector stores by merging the vector elements into an integer
-  /// of the same bitwidth.
-  SDValue LowerVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
-
 public:
   AMDGPUTargetLowering(TargetMachine &TM);
 
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index b822431fab3..e846ff4aee3 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -1002,7 +1002,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDValue Value = Op.getOperand(1);
   SDValue Ptr = Op.getOperand(2);
 
-  SDValue Result = AMDGPUTargetLowering::LowerVectorStore(Op, DAG);
+  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   if (Result.getNode()) {
     return Result;
   }
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index db4e8e9abbb..9c80798eeae 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -168,6 +168,58 @@ entry:
   ret void
 }
 
+;===------------------------------------------------------------------------===;
+; Local Address Space
+;===------------------------------------------------------------------------===;
+
+; EG-CHECK: @store_local_v2i16
+; EG-CHECK: LDS_WRITE
+; CM-CHECK: @store_local_v2i16
+; CM-CHECK: LDS_WRITE
+; SI-CHECK: @store_local_v2i16
+; SI-CHECK: DS_WRITE_B32
+define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
+entry:
+  store <2 x i16> %in, <2 x i16> addrspace(3)* %out
+  ret void
+}
+
+; EG-CHECK: @store_local_v2i32
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; CM-CHECK: @store_local_v2i32
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; SI-CHECK: @store_local_v2i32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
+entry:
+  store <2 x i32> %in, <2 x i32> addrspace(3)* %out
+  ret void
+}
+
+; EG-CHECK: @store_local_v4i32
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; CM-CHECK: @store_local_v4i32
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; SI-CHECK: @store_local_v4i32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+entry:
+  store <4 x i32> %in, <4 x i32> addrspace(3)* %out
+  ret void
+}
+
 ; The stores in this function are combined by the optimizer to create a
 ; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer
 ; should not try to split the 64-bit store back into 2 32-bit stores.
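
Note (not part of the patch): the transformation MergeVectorStore performs is easiest to see outside SelectionDAG. The standalone C++ sketch below mirrors its arithmetic under the same assumptions the patch makes (a vector memory type of at most 32 bits with 8- or 16-bit elements); the helper name packVectorStore is illustrative only and does not exist in LLVM.

  // Standalone sketch of MergeVectorStore's bit packing: each element is
  // truncated to its in-memory width, masked, shifted into position, and
  // OR'd into a single 32-bit word that can then be written with one store.
  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Hypothetical helper (illustration only): packs the low MemEltBits bits of
  // each element into one 32-bit word, element 0 in the least significant
  // bits, matching the shift amount (MemEltBits * i) used in the patch.
  static uint32_t packVectorStore(const std::vector<uint32_t> &Elts,
                                  unsigned MemEltBits) {
    const uint32_t Mask = (MemEltBits == 8) ? 0xFFu : 0xFFFFu;
    uint32_t Packed = 0;
    for (size_t i = 0; i < Elts.size(); ++i)
      Packed |= (Elts[i] & Mask) << (MemEltBits * i);
    return Packed;
  }

  int main() {
    // A <4 x i8> truncating store of <1, 2, 3, 4> packs to the i32 0x04030201.
    std::printf("%#010x\n", (unsigned)packVectorStore({1, 2, 3, 4}, 8));
    // A <2 x i16> truncating store of <0x1234, 0xabcd> packs to 0xabcd1234.
    std::printf("%#010x\n", (unsigned)packVectorStore({0x1234, 0xabcd}, 16));
    return 0;
  }

Stores that cannot be packed this way, such as the <2 x i32> and <4 x i32> local stores exercised in store.ll, fall through to SplitVectorStore, which emits one scalar store per element; that is why the test expects two and four LDS_WRITE / DS_WRITE_B32 instructions, respectively.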