R600: Add support for global vector stores with elements less than 32-bits

Tested-by: Aaron Watry <awatry@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188520 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Tom Stellard 2013-08-16 01:12:11 +00:00
parent ec484277dd
commit 4c52d450dc
4 changed files with 134 additions and 1 deletions

View File

@ -67,6 +67,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::f64, Promote); setOperationAction(ISD::STORE, MVT::f64, Promote);
AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
// XXX: This can be changed to Custom, once ExpandVectorStores can
// handle 64-bit stores.
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
setOperationAction(ISD::LOAD, MVT::f32, Promote); setOperationAction(ISD::LOAD, MVT::f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
@ -187,6 +194,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE: return LowerVectorStore(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
} }
return Op; return Op;
@ -487,6 +495,59 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return DAG.getMergeValues(Ops, 2, DL); return DAG.getMergeValues(Ops, 2, DL);
} }
/// Lower a small vector store by packing its elements into a single integer
/// of the same total bit width and storing that integer instead.
///
/// Each element of the stored value is extracted, zero-extended or truncated
/// to the packed integer type, masked to the memory element width, shifted
/// into position (element i lands at bit offset i * element-bits) and OR'd
/// into the packed value.
///
/// \param Op  The store node to lower.
/// \param DAG The SelectionDAG used to build the replacement nodes.
/// \returns The replacement store, or an empty SDValue when the memory type
/// is not a vector or is wider than 32 bits.
SDValue AMDGPUTargetLowering::LowerVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  // The visible callers only reach this function for ISD::STORE nodes, so use
  // cast<> (which asserts on a type mismatch) instead of dyn_cast<>, whose
  // possibly-null result was dereferenced without a check.
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack a
  // 32-bit vector truncating store into an i32 store.
  // XXX: We could also optimize other vector bitwidths.
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  const SDValue &Value = Store->getValue();
  EVT VT = Value.getValueType();
  // Element type of the value being stored; invariant across the loop below.
  EVT ElemVT = VT.getVectorElementType();
  const SDValue &Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  // Integer type as wide as the whole in-memory vector.
  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemBits);

  // Mask that keeps only the low MemEltBits bits of each element.
  SDValue Mask;
  switch (MemEltBits) {
  case 8:
    Mask = DAG.getConstant(0xFF, PackedVT);
    break;
  case 16:
    Mask = DAG.getConstant(0xFFFF, PackedVT);
    break;
  default:
    llvm_unreachable("Cannot lower this vector store");
  }

  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);

    // Move the element into its slot inside the packed integer.
    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);

    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
    }
  }

  // Reuse the original pointer info as a whole so that its offset is kept;
  // rebuilding it from the IR value alone would reset the offset to zero.
  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      Store->getPointerInfo(),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// Helper functions // Helper functions
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

View File

@ -51,6 +51,10 @@ protected:
void AnalyzeFormalArguments(CCState &State, void AnalyzeFormalArguments(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const; const SmallVectorImpl<ISD::InputArg> &Ins) const;
/// \brief Lower vector stores by merging the vector elements into an integer
/// of the same bitwidth.
///
/// \param Op  The store node to lower.
/// \param DAG The SelectionDAG in which the replacement nodes are created.
/// \returns The replacement store, or an empty SDValue when the stored memory
/// type is not a vector or is wider than 32 bits.
SDValue LowerVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
public: public:
AMDGPUTargetLowering(TargetMachine &TM); AMDGPUTargetLowering(TargetMachine &TM);

View File

@ -1011,10 +1011,15 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue Value = Op.getOperand(1); SDValue Value = Op.getOperand(1);
SDValue Ptr = Op.getOperand(2); SDValue Ptr = Op.getOperand(2);
SDValue Result = AMDGPUTargetLowering::LowerVectorStore(Op, DAG);
if (Result.getNode()) {
return Result;
}
if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
if (StoreNode->isTruncatingStore()) { if (StoreNode->isTruncatingStore()) {
EVT VT = Value.getValueType(); EVT VT = Value.getValueType();
assert(VT == MVT::i32); assert(VT.bitsLE(MVT::i32));
EVT MemVT = StoreNode->getMemoryVT(); EVT MemVT = StoreNode->getMemoryVT();
SDValue MaskConstant; SDValue MaskConstant;
if (MemVT == MVT::i8) { if (MemVT == MVT::i8) {
@ -1571,6 +1576,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
} }
} }
} }
case AMDGPUISD::EXPORT: { case AMDGPUISD::EXPORT: {
SDValue Arg = N->getOperand(1); SDValue Arg = N->getOperand(1);
if (Arg.getOpcode() != ISD::BUILD_VECTOR) if (Arg.getOpcode() != ISD::BUILD_VECTOR)

View File

@ -63,6 +63,49 @@ entry:
ret void ret void
} }
; Truncating vector store: <2 x i32> is truncated to <2 x i8> and stored
; through an addrspace(1) pointer. The EG checks expect one MEM_RAT MSKOR with
; no second one after it; the SI checks expect two BUFFER_STORE_BYTE.
; EG-CHECK: @store_v2i8
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK-NOT: MEM_RAT MSKOR
; SI-CHECK: @store_v2i8
; SI-CHECK: BUFFER_STORE_BYTE
; SI-CHECK: BUFFER_STORE_BYTE
define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, <2 x i8> addrspace(1)* %out
ret void
}
; Truncating vector store: <2 x i32> is truncated to <2 x i16> (32 bits in
; memory) and stored through an addrspace(1) pointer. The checks expect a
; dword-sized store on each target: STORE_RAW, STORE_DWORD and
; BUFFER_STORE_DWORD respectively.
; EG-CHECK: @store_v2i16
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
; CM-CHECK: @store_v2i16
; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
; SI-CHECK: @store_v2i16
; SI-CHECK: BUFFER_STORE_DWORD
define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, <2 x i16> addrspace(1)* %out
ret void
}
; Truncating vector store: <4 x i32> is truncated to <4 x i8> (32 bits in
; memory) and stored through an addrspace(1) pointer. The EG and CM checks
; expect a STORE_RAW / STORE_DWORD instruction; the SI checks expect four
; BUFFER_STORE_BYTE instructions.
; EG-CHECK: @store_v4i8
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
; CM-CHECK: @store_v4i8
; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
; SI-CHECK: @store_v4i8
; SI-CHECK: BUFFER_STORE_BYTE
; SI-CHECK: BUFFER_STORE_BYTE
; SI-CHECK: BUFFER_STORE_BYTE
; SI-CHECK: BUFFER_STORE_BYTE
define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, <4 x i8> addrspace(1)* %out
ret void
}
; floating-point store ; floating-point store
; EG-CHECK: @store_f32 ; EG-CHECK: @store_f32
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
@ -76,6 +119,25 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
ret void ret void
} }
; Truncating vector store: <4 x i32> is truncated to <4 x i16> (64 bits in
; memory) and stored through an addrspace(1) pointer. The EG checks expect
; four MEM_RAT MSKOR instructions with no fifth after them; the SI checks
; expect four BUFFER_STORE_SHORT instructions with no BUFFER_STORE_BYTE
; following them.
; EG-CHECK: @store_v4i16
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK-NOT: MEM_RAT MSKOR
; SI-CHECK: @store_v4i16
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK-NOT: BUFFER_STORE_BYTE
define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
%0 = trunc <4 x i32> %in to <4 x i16>
store <4 x i16> %0, <4 x i16> addrspace(1)* %out
ret void
}
; vec2 floating-point stores ; vec2 floating-point stores
; EG-CHECK: @store_v2f32 ; EG-CHECK: @store_v2f32
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW