mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-12 17:32:19 +00:00
R600/SI: Implement areLoadsFromSameBasePtr
This currently has a noticeable effect on the kernel argument loads. LDS and global loads are more problematic, I think because of how copies are currently inserted to ensure that the address is a VGPR. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214942 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
5200d84bdc
commit
c9c70b1651
@ -32,6 +32,104 @@ SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
|
||||
// TargetInstrInfo callbacks
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// Return the number of operands of \p Node, not counting any trailing
/// operands whose value type is MVT::Glue.
static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned Count = Node->getNumOperands();

  // Walk backwards from the last operand, dropping glue operands until a
  // non-glue operand (or the start of the operand list) is reached.
  for (; Count != 0; --Count) {
    if (Node->getOperand(Count - 1).getValueType() != MVT::Glue)
      break;
  }

  return Count;
}
|
||||
|
||||
/// Return the last non-glue operand of \p Load, which is asserted to be the
/// chain (value type MVT::Other).
static SDValue findChainOperand(SDNode *Load) {
  unsigned NumOps = getNumOperandsNoGlue(Load);

  // NumOps is unsigned, so NumOps - 1 would wrap around if the node had no
  // non-glue operands at all. Catch that before indexing.
  assert(NumOps != 0 && "Load node has no non-glue operands");

  SDValue LastOp = Load->getOperand(NumOps - 1);
  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
  return LastOp;
}
|
||||
|
||||
/// Determine whether two selected load nodes read through the same base
/// pointer and, if so, report their immediate offsets.
///
/// Three instruction families are handled: DS, SMRD, and MUBUF/MTBUF (the
/// latter two may be mixed with each other). In every case both loads must
/// share the same base operand and the same chain operand.
///
/// \param Load0   First load node; must be a machine node that may load.
/// \param Load1   Second load node; must be a machine node that may load.
/// \param Offset0 Set to the zero-extended constant offset operand of
///                \p Load0. Only written when the function returns true.
/// \param Offset1 Set to the zero-extended constant offset operand of
///                \p Load1. Only written when the function returns true.
/// \returns true if both loads were recognized as using the same base
///          pointer; false otherwise (including all unhandled cases).
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1) &&
           "DS loads expected to have the same number of operands");

    // TODO: Also shouldn't see read2st
    assert(Opc0 != AMDGPU::DS_READ2_B32 &&
           Opc0 != AMDGPU::DS_READ2_B64 &&
           Opc1 != AMDGPU::DS_READ2_B32 &&
           Opc1 != AMDGPU::DS_READ2_B64 &&
           "Unexpected DS_READ2 load");

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1) &&
           "SMRD loads expected to have the same number of operands");

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
    // Skip if an SGPR offset is applied to either load. I don't think we ever
    // emit any of the variants that use this currently. Both opcodes are
    // checked so that the answer does not depend on argument order.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::soffset) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::soffset) != -1)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // MUBUF and MTBUF have vaddr at different indices, so look the operands
    // up by name for each opcode separately.
    int VaddrIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::vaddr);
    int VaddrIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::vaddr);
    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    // A variant lacking one of these named operands cannot be analyzed here;
    // bail out instead of indexing with an invalid operand number.
    if (VaddrIdx0 == -1 || VaddrIdx1 == -1 || OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for the MachineInstr's operands,
    // which includes the result as the first operand. We are indexing into the
    // MachineSDNode's operands, so we need to skip the result operand to get
    // the real index.
    --VaddrIdx0;
    --VaddrIdx1;
    --OffIdx0;
    --OffIdx1;

    // Check base address.
    if (Load0->getOperand(VaddrIdx0) != Load1->getOperand(VaddrIdx1))
      return false;

    Offset0 = cast<ConstantSDNode>(Load0->getOperand(OffIdx0))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(OffIdx1))->getZExtValue();
    return true;
  }

  return false;
}
|
||||
|
||||
bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt,
|
||||
unsigned &BaseReg, unsigned &Offset,
|
||||
const TargetRegisterInfo *TRI) const {
|
||||
|
@ -62,6 +62,10 @@ public:
|
||||
return RI;
|
||||
}
|
||||
|
||||
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
|
||||
int64_t &Offset1,
|
||||
int64_t &Offset2) const override;
|
||||
|
||||
bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
|
||||
unsigned &BaseReg, unsigned &Offset,
|
||||
const TargetRegisterInfo *TRI) const final;
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck %s
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; Test that codegenprepare understands address space sizes
|
||||
|
||||
@ -10,8 +10,8 @@
|
||||
; CHECK-LABEL: @do_as_ptr_calcs:
|
||||
; CHECK: S_LOAD_DWORD [[SREG1:s[0-9]+]],
|
||||
; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
|
||||
; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0x14
|
||||
; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0xc
|
||||
; CHECK-DAG: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0xc
|
||||
; CHECK-DAG: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0x14
|
||||
define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
|
||||
entry:
|
||||
%x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
|
||||
|
@ -1,8 +1,8 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
|
||||
; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i32_offset:
|
||||
; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
||||
; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
||||
; SI-DAG: V_MOV_B32_e32 [[VCMP:v[0-9]+]], 7
|
||||
; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||
; SI-DAG: V_MOV_B32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
|
||||
@ -17,8 +17,8 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i64_offset:
|
||||
; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SI: S_MOV_B64 s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
|
||||
; SI-DAG: V_MOV_B32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
|
||||
; SI-DAG: V_MOV_B32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
|
||||
|
@ -7,7 +7,7 @@ declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
|
||||
declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
|
||||
|
||||
; FUNC-LABEL: @s_ctpop_i64:
|
||||
; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]],
|
||||
; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]]
|
||||
; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
||||
; SI: BUFFER_STORE_DWORD [[VRESULT]],
|
||||
|
@ -68,12 +68,12 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
|
||||
|
||||
; SI-LABEL: @load_v4i8_to_v4f32_2_uses:
|
||||
; SI: BUFFER_LOAD_UBYTE
|
||||
; SI: V_CVT_F32_UBYTE0_e32
|
||||
; SI: BUFFER_LOAD_UBYTE
|
||||
; SI: BUFFER_LOAD_UBYTE
|
||||
; SI: BUFFER_LOAD_UBYTE
|
||||
; SI: V_CVT_F32_UBYTE0_e32
|
||||
; SI: BUFFER_LOAD_UBYTE
|
||||
; SI: V_CVT_F32_UBYTE0_e32
|
||||
; SI: BUFFER_LOAD_UBYTE
|
||||
; SI: V_CVT_F32_UBYTE0_e32
|
||||
; SI: V_CVT_F32_UBYTE0_e32
|
||||
|
||||
; XXX - replace with this when v4i8 loads aren't scalarized anymore.
|
||||
|
@ -2,9 +2,9 @@
|
||||
|
||||
; FUNC-LABEL: @extract_vector_elt_v2i16
|
||||
; SI: BUFFER_LOAD_USHORT
|
||||
; SI: BUFFER_STORE_SHORT
|
||||
; SI: BUFFER_LOAD_USHORT
|
||||
; SI: BUFFER_STORE_SHORT
|
||||
; SI: BUFFER_STORE_SHORT
|
||||
define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
|
||||
%p0 = extractelement <2 x i16> %foo, i32 0
|
||||
%p1 = extractelement <2 x i16> %foo, i32 1
|
||||
@ -16,9 +16,9 @@ define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) no
|
||||
|
||||
; FUNC-LABEL: @extract_vector_elt_v4i16
|
||||
; SI: BUFFER_LOAD_USHORT
|
||||
; SI: BUFFER_STORE_SHORT
|
||||
; SI: BUFFER_LOAD_USHORT
|
||||
; SI: BUFFER_STORE_SHORT
|
||||
; SI: BUFFER_STORE_SHORT
|
||||
define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
|
||||
%p0 = extractelement <4 x i16> %foo, i32 0
|
||||
%p1 = extractelement <4 x i16> %foo, i32 2
|
||||
|
@ -8,9 +8,9 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read
|
||||
|
||||
; Try to identify arg based on higher address.
|
||||
; FUNC-LABEL: @test_copysign_f32:
|
||||
; SI: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb
|
||||
; SI: S_LOAD_DWORD [[SSIGN:s[0-9]+]], {{.*}} 0xc
|
||||
; SI: V_MOV_B32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
|
||||
; SI-DAG: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb
|
||||
; SI-DAG: V_MOV_B32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
|
||||
; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
|
||||
; SI: V_BFI_B32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
|
||||
|
@ -5,9 +5,9 @@ declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind r
|
||||
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
; FUNC-LABEL: @test_copysign_f64:
|
||||
; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SI: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
|
||||
; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SI-DAG: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
|
||||
; SI-DAG: V_MOV_B32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
|
||||
; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
|
||||
; SI: V_BFI_B32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
|
||||
|
@ -40,37 +40,37 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
|
||||
; SI: DS_WRITE_B8
|
||||
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_READ_U8
|
||||
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_READ_U8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_WRITE_B8
|
||||
; SI: DS_WRITE_B8
|
||||
|
||||
; SI: S_ENDPGM
|
||||
@ -100,20 +100,21 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %
|
||||
; SI: DS_WRITE_B16
|
||||
|
||||
; SI: DS_READ_U16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_READ_U16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_READ_U16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_READ_U16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_READ_U16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_READ_U16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_READ_U16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_READ_U16
|
||||
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_WRITE_B16
|
||||
; SI: DS_WRITE_B16
|
||||
|
||||
; SI: S_ENDPGM
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
|
||||
; RUN: llc -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
;;;==========================================================================;;;
|
||||
;;; MUBUF LOAD TESTS
|
||||
@ -28,7 +28,7 @@ entry:
|
||||
|
||||
; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
|
||||
; CHECK-LABEL: @mubuf_load2
|
||||
; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80
|
||||
; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80
|
||||
define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
entry:
|
||||
%0 = getelementptr i32 addrspace(1)* %in, i64 1024
|
||||
|
@ -1,9 +1,9 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
|
||||
; FUNC-LABEL: @s_rotl_i64:
|
||||
; SI: S_LSHL_B64
|
||||
; SI: S_SUB_I32
|
||||
; SI: S_LSHR_B64
|
||||
; SI: S_LSHL_B64
|
||||
; SI: S_OR_B64
|
||||
define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
|
||||
entry:
|
||||
|
26
test/CodeGen/R600/schedule-global-loads.ll
Normal file
26
test/CodeGen/R600/schedule-global-loads.ll
Normal file
@ -0,0 +1,26 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
|
||||
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #1
|
||||
|
||||
; FIXME: This currently doesn't do a great job of clustering the
|
||||
; loads, which end up with extra moves between them. Right now, it
|
||||
; seems the only thing areLoadsFromSameBasePtr is accomplishing is
|
||||
; ordering the loads so that the lower address loads come first.
|
||||
|
||||
; FUNC-LABEL: @cluster_global_arg_loads
|
||||
; SI: BUFFER_LOAD_DWORD [[REG0:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
|
||||
; SI: BUFFER_LOAD_DWORD [[REG1:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
|
||||
; SI: BUFFER_STORE_DWORD [[REG0]]
|
||||
; SI: BUFFER_STORE_DWORD [[REG1]]
|
||||
define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
|
||||
%load0 = load i32 addrspace(1)* %ptr, align 4
|
||||
%gep = getelementptr i32 addrspace(1)* %ptr, i32 1
|
||||
%load1 = load i32 addrspace(1)* %gep, align 4
|
||||
store i32 %load0, i32 addrspace(1)* %out0, align 4
|
||||
store i32 %load1, i32 addrspace(1)* %out1, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
12
test/CodeGen/R600/schedule-kernel-arg-loads.ll
Normal file
12
test/CodeGen/R600/schedule-kernel-arg-loads.ll
Normal file
@ -0,0 +1,12 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
|
||||
|
||||
; FUNC-LABEL: @cluster_arg_loads
|
||||
; SI: S_LOAD_DWORDX2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||
; SI-NEXT: S_LOAD_DWORDX2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI-NEXT: S_LOAD_DWORD s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SI-NEXT: S_LOAD_DWORD s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
|
||||
define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
|
||||
store i32 %x, i32 addrspace(1)* %out0, align 4
|
||||
store i32 %y, i32 addrspace(1)* %out1, align 4
|
||||
ret void
|
||||
}
|
@ -30,10 +30,11 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
|
||||
}
|
||||
|
||||
; SI-LABEL: @trunc_shl_i64:
|
||||
; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}},
|
||||
; SI: S_ADD_I32 s[[LO_ADD:[0-9]+]], s[[LO_SREG]],
|
||||
; SI: S_LSHL_B64 s{{\[}}[[LO_SREG2:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2
|
||||
; SI: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
|
||||
; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SI: S_ADD_I32 s[[LO_SREG2:[0-9]+]], s[[LO_SREG]],
|
||||
; SI: S_ADDC_U32
|
||||
; SI: S_LSHL_B64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG2]]:{{[0-9]+\]}}, 2
|
||||
; SI: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SHL]]
|
||||
; SI: BUFFER_STORE_DWORD v[[LO_VREG]],
|
||||
define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
|
||||
%aa = add i64 %a, 234 ; Prevent shrinking store.
|
||||
|
@ -1,37 +1,45 @@
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI --verify-machineinstrs | FileCheck %s
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
;CHECK-LABEL: @main
|
||||
;CHECK: S_WAITCNT lgkmcnt(0)
|
||||
;CHECK: S_WAITCNT vmcnt(0)
|
||||
;CHECK: S_WAITCNT expcnt(0) lgkmcnt(0)
|
||||
|
||||
define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, i32 inreg, i32, i32, i32, i32) #0 {
|
||||
; CHECK-LABEL: @main
|
||||
; CHECK: S_LOAD_DWORDX4
|
||||
; CHECK: S_LOAD_DWORDX4
|
||||
; CHECK: S_WAITCNT lgkmcnt(0)
|
||||
; CHECK: S_WAITCNT vmcnt(0)
|
||||
; CHECK: S_WAITCNT expcnt(0) lgkmcnt(0)
|
||||
define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
|
||||
main_body:
|
||||
%10 = getelementptr <16 x i8> addrspace(2)* %3, i32 0
|
||||
%11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
|
||||
%12 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %11, i32 0, i32 %6)
|
||||
%13 = extractelement <4 x float> %12, i32 0
|
||||
%14 = extractelement <4 x float> %12, i32 1
|
||||
%15 = extractelement <4 x float> %12, i32 2
|
||||
%16 = extractelement <4 x float> %12, i32 3
|
||||
%17 = getelementptr <16 x i8> addrspace(2)* %3, i32 1
|
||||
%18 = load <16 x i8> addrspace(2)* %17, !tbaa !0
|
||||
%19 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %18, i32 0, i32 %6)
|
||||
%20 = extractelement <4 x float> %19, i32 0
|
||||
%21 = extractelement <4 x float> %19, i32 1
|
||||
%22 = extractelement <4 x float> %19, i32 2
|
||||
%23 = extractelement <4 x float> %19, i32 3
|
||||
call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %20, float %21, float %22, float %23)
|
||||
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %13, float %14, float %15, float %16)
|
||||
%tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0
|
||||
%tmp10 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
|
||||
%tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
|
||||
%tmp12 = extractelement <4 x float> %tmp11, i32 0
|
||||
%tmp13 = extractelement <4 x float> %tmp11, i32 1
|
||||
call void @llvm.AMDGPU.barrier.global() #1
|
||||
%tmp14 = extractelement <4 x float> %tmp11, i32 2
|
||||
; %tmp15 = extractelement <4 x float> %tmp11, i32 3
|
||||
%tmp15 = load float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
|
||||
%tmp16 = getelementptr <16 x i8> addrspace(2)* %arg3, i32 1
|
||||
%tmp17 = load <16 x i8> addrspace(2)* %tmp16, !tbaa !0
|
||||
%tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
|
||||
%tmp19 = extractelement <4 x float> %tmp18, i32 0
|
||||
%tmp20 = extractelement <4 x float> %tmp18, i32 1
|
||||
%tmp21 = extractelement <4 x float> %tmp18, i32 2
|
||||
%tmp22 = extractelement <4 x float> %tmp18, i32 3
|
||||
call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
|
||||
call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: noduplicate nounwind
|
||||
declare void @llvm.AMDGPU.barrier.global() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
|
||||
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
|
||||
|
||||
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
|
||||
|
||||
attributes #0 = { "ShaderType"="1" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #1 = { noduplicate nounwind }
|
||||
attributes #2 = { nounwind readnone }
|
||||
|
||||
!0 = metadata !{metadata !"const", null, i32 1}
|
||||
!0 = metadata !{metadata !1, metadata !1, i64 0, i32 1}
|
||||
!1 = metadata !{metadata !"const", null}
|
||||
|
Loading…
x
Reference in New Issue
Block a user