diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 94f7d6920ab..c32c6855a5c 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
 
 using namespace llvm;
 
@@ -832,6 +833,88 @@ SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
   }
 }
 
+static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
+                                int WidthB, int OffsetB) {
+  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+  return LowOffset + LowWidth <= HighOffset;
+}
+
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
+                                               MachineInstr *MIb) const {
+  unsigned BaseReg0, Offset0;
+  unsigned BaseReg1, Offset1;
+
+  if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
+      getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
+    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
+           "read2 / write2 not expected here yet");
+    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
+    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
+    if (BaseReg0 == BaseReg1 &&
+        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+                                                  MachineInstr *MIb,
+                                                  AliasAnalysis *AA) const {
+  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+         "MIa must load from or modify a memory location");
+  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+         "MIb must load from or modify a memory location");
+
+  unsigned Opc0 = MIa->getOpcode();
+  unsigned Opc1 = MIb->getOpcode();
+
+  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
+    return false;
+
+  // XXX - Can we relax this between address spaces?
+  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+    return false;
+
+  // TODO: Should we check the address space from the MachineMemOperand? That
+  // would allow us to distinguish objects we know don't alias based on the
+  // underlying address space, even if it was lowered to a different one,
+  // e.g. private accesses lowered to use MUBUF instructions on a scratch
+  // buffer.
+  if (isDS(Opc0)) {
+    if (isDS(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(Opc1);
+  }
+
+  if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
+    if (isMUBUF(Opc1) || isMTBUF(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(Opc1) && !isSMRD(Opc1);
+  }
+
+  if (isSMRD(Opc0)) {
+    if (isSMRD(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(Opc1) && !isMUBUF(Opc1) && !isMTBUF(Opc1);
+  }
+
+  if (isFLAT(Opc0)) {
+    if (isFLAT(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return false;
+  }
+
+  return false;
+}
+
 namespace llvm {
 namespace AMDGPU {
 // Helper function generated by tablegen.
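For reference, offsetsDoNotOverlap above treats each access as the half-open byte range [Offset, Offset + Width) and reports the two accesses disjoint exactly when the lower range ends at or before the higher one begins. A minimal standalone sketch of that check; the main() harness and its sample values are illustrative, not part of the patch:

#include <cassert>

// Same logic as the patch's static helper: order the two ranges by
// offset, then require the lower one to end before the higher begins.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

int main() {
  assert(offsetsDoNotOverlap(4, 0, 4, 4));     // [0,4) vs [4,8): disjoint
  assert(!offsetsDoNotOverlap(4, 0, 4, 2));    // [0,4) vs [2,6): overlap
  assert(!offsetsDoNotOverlap(8, 0, 4, 4));    // [0,8) contains [4,8)
  assert(offsetsDoNotOverlap(4, 400, 4, 404)); // dwords at 400/404: disjoint
  return 0;
}

Adjacent same-width accesses count as disjoint, which is what lets the scheduler pair up the two ds_read_b32 at offsets 4 and 8 in the tests below.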
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index ce32fd7fa65..3bdbc9b5498 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -57,6 +57,9 @@ private:
   void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
 
+  bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
+                                    MachineInstr *MIb) const;
+
   unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
 
 public:
@@ -115,6 +118,10 @@ public:
   bool isTriviallyReMaterializable(const MachineInstr *MI,
                                    AliasAnalysis *AA = nullptr) const;
 
+  bool areMemAccessesTriviallyDisjoint(
+    MachineInstr *MIa, MachineInstr *MIb,
+    AliasAnalysis *AA = nullptr) const override;
+
   MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
                               unsigned DstReg, unsigned SrcReg) const override;
@@ -131,6 +138,7 @@ public:
   bool isVOP2(uint16_t Opcode) const;
   bool isVOP3(uint16_t Opcode) const;
   bool isVOPC(uint16_t Opcode) const;
+  bool isInlineConstant(const APInt &Imm) const;
   bool isInlineConstant(const MachineOperand &MO) const;
   bool isLiteralConstant(const MachineOperand &MO) const;
diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
new file mode 100644
index 00000000000..2c146eb288d
--- /dev/null
+++ b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
@@ -0,0 +1,238 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.AMDGPU.barrier.local() #2
+
+
+@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
+@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
+@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
+
+; FUNC-LABEL: @reorder_local_load_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
+define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI: buffer_store_dword
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store volatile i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI: buffer_store_dword
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  call void @llvm.AMDGPU.barrier.local() #2
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Technically we could reorder these, but just comparing the
+; instruction type of the load is insufficient.
+
+; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; XXX: Should be able to reorder this, but the loads count as ordered
+
+; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
+  %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_global_load_local_store_global_load
+; CI: buffer_load_dword
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(1)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(1)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(1)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(1)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_local_offsets
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 101
+
+  store i32 123, i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr3, align 4
+  store i32 123, i32 addrspace(3)* %ptr2, align 4
+  %tmp3 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 789, i32 addrspace(3)* %ptr3, align 4
+
+  %add.0 = add nsw i32 %tmp2, %tmp1
+  %add.1 = add nsw i32 %add.0, %tmp3
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_global_offsets
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0xc
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x190
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x194
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x190
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x194
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 101
+
+  store i32 123, i32 addrspace(1)* %ptr1, align 4
+  %tmp1 = load i32 addrspace(1)* %ptr2, align 4
+  %tmp2 = load i32 addrspace(1)* %ptr3, align 4
+  store i32 123, i32 addrspace(1)* %ptr2, align 4
+  %tmp3 = load i32 addrspace(1)* %ptr1, align 4
+  store i32 789, i32 addrspace(1)* %ptr3, align 4
+
+  %add.0 = add nsw i32 %tmp2, %tmp1
+  %add.1 = add nsw i32 %add.0, %tmp3
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load
+; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
+; XCI: TBUFFER_STORE_FORMAT
+; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
+; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 {
+;   %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+;   %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+;   %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+;   %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+
+;   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+;   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+;     i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
;     i32 1, i32 0)

;   %tmp2 = load i32 addrspace(3)* %ptr2, align 4

;   %add = add nsw i32 %tmp1, %tmp2

;   store i32 %add, i32 addrspace(1)* %out, align 4
;   ret void
; }

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #2 = { nounwind noduplicate }
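As a reading aid, the pairing rules in areMemAccessesTriviallyDisjoint reduce to a small matrix over instruction kinds: same-kind pairs fall through to the base-register-and-offset comparison, FLAT may alias anything, buffer (MUBUF/MTBUF) and SMRD accesses may alias each other because both can reach global memory, and DS accesses to LDS are disjoint from every non-FLAT kind. A standalone sketch of that matrix; the MemKind enum and disjointByKind helper are hypothetical names for illustration, not part of the patch:

#include <cstdio>

// Illustrative stand-ins for the patch's isDS/isMUBUF/isMTBUF/isSMRD/isFLAT
// opcode predicates.
enum MemKind { DS, Buf /* MUBUF or MTBUF */, SMRD, FLAT };

// True if accesses of these two kinds can never alias regardless of
// offsets; same-kind pairs would instead go through the offset check.
static bool disjointByKind(MemKind A, MemKind B) {
  if (A == B)
    return false; // same kind: compare base registers and offsets instead
  if (A == FLAT || B == FLAT)
    return false; // FLAT can address any memory space
  if ((A == Buf && B == SMRD) || (A == SMRD && B == Buf))
    return false; // both can read the same global memory
  return true;    // DS (LDS) never aliases buffer or scalar accesses
}

int main() {
  printf("DS  vs Buf:  %d\n", disjointByKind(DS, Buf));   // 1: reorderable
  printf("Buf vs SMRD: %d\n", disjointByKind(Buf, SMRD)); // 0: may alias
  printf("DS  vs FLAT: %d\n", disjointByKind(DS, FLAT));  // 0: may alias
  return 0;
}

This is the behavior the reorder_* and no_reorder_* tests above pin down: DS loads may move across a global store, while constant (SMRD or buffer) loads may not move across one.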