diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 928c0e3ba6d..1bbfd2b68f3 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -47,6 +47,11 @@ private: break; } + // These will be expanded to two ALU instructions in the + // ExpandSpecialInstructions pass. + if (TII->isLDSRetInstr(MI->getOpcode())) + return 2; + if(TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()) || TII->isReductionOp(MI->getOpcode())) @@ -106,8 +111,13 @@ private: } bool SubstituteKCacheBank(MachineInstr *MI, - std::vector > &CachedConsts) const { + std::vector > &CachedConsts, + bool UpdateInstr = true) const { std::vector > UsedKCache; + + if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) + return true; + const SmallVectorImpl > &Consts = TII->getSrcs(MI); assert((TII->isALUInstr(MI->getOpcode()) || @@ -140,6 +150,9 @@ private: return false; } + if (!UpdateInstr) + return true; + for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) continue; @@ -160,6 +173,52 @@ private: return true; } + bool canClauseLocalKillFitInClause( + unsigned AluInstCount, + std::vector > KCacheBanks, + MachineBasicBlock::iterator Def, + MachineBasicBlock::iterator BBEnd) { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + for (MachineInstr::const_mop_iterator + MOI = Def->operands_begin(), + MOE = Def->operands_end(); MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef() || + TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) + continue; + + // Def defines a clause local register, so check that its use will fit + // in the clause. + unsigned LastUseCount = 0; + for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { + AluInstCount += OccupiedDwords(UseI); + // Make sure we won't need to end the clause due to KCache limitations. + if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) + return false; + + // We have reached the maximum instruction limit before finding the + // use that kills this register, so we cannot use this def in the + // current clause. + if (AluInstCount >= TII->getMaxAlusPerClause()) + return false; + + // Register kill flags have been cleared by the time we get to this + // pass, but it is safe to assume that all uses of this register + // occur in the same basic block as its definition, because + // it is illegal for the scheduler to schedule them in + // different blocks. + if (UseI->findRegisterUseOperandIdx(MOI->getReg())) + LastUseCount = AluInstCount; + + if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) + break; + } + if (LastUseCount) + return LastUseCount <= TII->getMaxAlusPerClause(); + llvm_unreachable("Clause local register live at end of clause."); + } + return true; + } + MachineBasicBlock::iterator MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator ClauseHead = I; @@ -198,11 +257,13 @@ private: I++; break; } - if (TII->isALUInstr(I->getOpcode()) && - !SubstituteKCacheBank(I, KCacheBanks)) + + // If this instruction defines a clause local register, make sure + // its use can fit in this clause. + if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) break; - if (I->getOpcode() == AMDGPU::DOT_4 && - !SubstituteKCacheBank(I, KCacheBanks)) + + if (!SubstituteKCacheBank(I, KCacheBanks)) break; AluInstCount += OccupiedDwords(I); } diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index 67b42d704f7..aeee4aa8956 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -68,6 +68,23 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; I = llvm::next(I); + // Expand LDS_*_RET instructions + if (TII->isLDSRetInstr(MI.getOpcode())) { + int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineOperand &DstOp = MI.getOperand(DstIdx); + MachineInstr *Mov = TII->buildMovInstr(&MBB, I, + DstOp.getReg(), AMDGPU::OQAP); + DstOp.setReg(AMDGPU::OQAP); + int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), + AMDGPU::OpName::pred_sel); + int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), + AMDGPU::OpName::pred_sel); + // Copy the pred_sel bit + Mov->getOperand(MovPredSelIdx).setReg( + MI.getOperand(LDSPredSelIdx).getReg()); + } + switch (MI.getOpcode()) { default: break; // Expand PRED_X to one of the PRED_SET instructions. diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 4b6c2eb5b15..0fcb488672f 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -134,21 +134,17 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( switch (MI->getOpcode()) { default: - if (TII->isLDSInstr(MI->getOpcode()) && - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst) != -1) { + // Replace LDS_*_RET instruction that don't have any uses with the + // equivalent LDS_*_NORET instruction. + if (TII->isLDSRetInstr(MI->getOpcode())) { int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; - if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) { - NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()), - AMDGPU::OQAP); - TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV, - MI->getOperand(0).getReg(), - AMDGPU::OQAP); - } else { - NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); - } + if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) + return BB; + + NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), + TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { NewMI.addOperand(MI->getOperand(i)); } diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index aff11ce1b34..3987b2d436c 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -141,6 +141,14 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { (TargetFlags & R600_InstFlag::LDS_1A2D)); } +bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; +} + +bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; +} + bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { if (isALUInstr(MI->getOpcode())) return true; diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index e2996c7a78f..b29b91f8d7e 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -65,6 +65,8 @@ namespace llvm { bool isALUInstr(unsigned Opcode) const; bool hasInstrModifiers(unsigned Opcode) const; bool isLDSInstr(unsigned Opcode) const; + bool isLDSNoRetInstr(unsigned Opcode) const; + bool isLDSRetInstr(unsigned Opcode) const; /// \returns true if this \p Opcode represents an ALU instruction or an /// instruction that will be lowered in ExpandSpecialInstrs Pass. diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 6c26d9ece40..da2a4d862e7 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -92,15 +92,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { AllowSwitchFromAlu = true; } - - // We want to scheduled AR defs as soon as possible to make sure they aren't - // put in a different ALU clause from their uses. - if (!SU && !UnscheduledARDefs.empty()) { - SU = UnscheduledARDefs[0]; - UnscheduledARDefs.erase(UnscheduledARDefs.begin()); - NextInstKind = IDAlu; - } - if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { // try to pick ALU @@ -130,15 +121,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { NextInstKind = IDOther; } - // We want to schedule the AR uses as late as possible to make sure that - // the AR defs have been released. - if (!SU && !UnscheduledARUses.empty()) { - SU = UnscheduledARUses[0]; - UnscheduledARUses.erase(UnscheduledARUses.begin()); - NextInstKind = IDAlu; - } - - DEBUG( if (SU) { dbgs() << " ** Pick node **\n"; @@ -217,20 +199,6 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) { int IK = getInstKind(SU); - // Check for AR register defines - for (MachineInstr::const_mop_iterator I = SU->getInstr()->operands_begin(), - E = SU->getInstr()->operands_end(); - I != E; ++I) { - if (I->isReg() && I->getReg() == AMDGPU::AR_X) { - if (I->isDef()) { - UnscheduledARDefs.push_back(SU); - } else { - UnscheduledARUses.push_back(SU); - } - return; - } - } - // There is no export clause, we can schedule one as soon as its ready if (IK == IDOther) Available[IDOther].push_back(SU); diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h index 0a6f1204a4d..97c8cdec0aa 100644 --- a/lib/Target/R600/R600MachineScheduler.h +++ b/lib/Target/R600/R600MachineScheduler.h @@ -53,8 +53,6 @@ class R600SchedStrategy : public MachineSchedStrategy { std::vector Available[IDLast], Pending[IDLast]; std::vector AvailableAlus[AluLast]; - std::vector UnscheduledARDefs; - std::vector UnscheduledARUses; std::vector PhysicalRegCopy; InstKind CurInstKind; diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp index fbe333d2038..f3bb88b3eef 100644 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -85,3 +85,16 @@ const RegClassWeight &R600RegisterInfo::getRegClassWeight( const TargetRegisterClass *RC) const { return RCW; } + +bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + switch (Reg) { + case AMDGPU::OQAP: + case AMDGPU::OQBP: + case AMDGPU::AR_X: + return false; + default: + return true; + } +} diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index 8833ee77e04..c74c49ecdcd 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -47,6 +47,8 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const; + // \returns true if \p Reg can be defined in one ALU caluse and used in another. + virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const; }; } // End namespace llvm diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll new file mode 100644 index 00000000000..63a4332d3c4 --- /dev/null +++ b/test/CodeGen/R600/lds-output-queue.ll @@ -0,0 +1,99 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s +; +; This test checks that the lds input queue will is empty at the end of +; the ALU clause. + +; CHECK-LABEL: @lds_input_queue +; CHECK: LDS_READ_RET * OQAP +; CHECK-NOT: ALU clause +; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP + +@local_mem = internal addrspace(3) unnamed_addr global [2 x i32] [i32 1, i32 2], align 4 + +define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { +entry: + %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index + %1 = load i32 addrspace(3)* %0 + call void @llvm.AMDGPU.barrier.local() + + ; This will start a new clause for the vertex fetch + %2 = load i32 addrspace(1)* %in + %3 = add i32 %1, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +declare void @llvm.AMDGPU.barrier.local() + +; The machine scheduler does not do proper alias analysis and assumes that +; loads from global values (Note that a global value is different that a +; value from global memory. A global value is a value that is declared +; outside of a function, it can reside in any address space) alias with +; all other loads. +; +; This is a problem for scheduling the reads from the local data share (lds). +; These reads are implemented using two instructions. The first copies the +; data from lds into the lds output queue, and the second moves the data from +; the input queue into main memory. These two instructions don't have to be +; scheduled one after the other, but they do need to be scheduled in the same +; clause. The aliasing problem mentioned above causes problems when there is a +; load from global memory which immediately follows a load from a global value that +; has been declared in the local memory space: +; +; %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index +; %1 = load i32 addrspace(3)* %0 +; %2 = load i32 addrspace(1)* %in +; +; The instruction selection phase will generate ISA that looks like this: +; %OQAP = LDS_READ_RET +; %vreg0 = MOV %OQAP +; %vreg1 = VTX_READ_32 +; %vreg2 = ADD_INT %vreg1, %vreg0 +; +; The bottom scheduler will schedule the two ALU instructions first: +; +; UNSCHEDULED: +; %OQAP = LDS_READ_RET +; %vreg1 = VTX_READ_32 +; +; SCHEDULED: +; +; vreg0 = MOV %OQAP +; vreg2 = ADD_INT %vreg1, %vreg2 +; +; The lack of proper aliasing results in the local memory read (LDS_READ_RET) +; to consider the global memory read (VTX_READ_32) has a chain dependency, so +; the global memory read will always be scheduled first. This will give us a +; final program which looks like this: +; +; Alu clause: +; %OQAP = LDS_READ_RET +; VTX clause: +; %vreg1 = VTX_READ_32 +; Alu clause: +; vreg0 = MOV %OQAP +; vreg2 = ADD_INT %vreg1, %vreg2 +; +; This is an illegal program because the OQAP def and use know occur in +; different ALU clauses. +; +; This test checks this scenario and makes sure it doesn't result in an +; illegal program. For now, we have fixed this issue by merging the +; LDS_READ_RET and MOV together during instruction selection and then +; expanding them after scheduling. Once the scheduler has better alias +; analysis, we should be able to keep these instructions sparate before +; scheduling. +; +; CHECK-LABEL: @local_global_alias +; CHECK: LDS_READ_RET +; CHECK-NOT: ALU clause +; CHECK MOV * T{{[0-9]\.[XYZW]}}, OQAP +define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 + %1 = load i32 addrspace(3)* %0 + %2 = load i32 addrspace(1)* %in + %3 = add i32 %2, %1 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll index b413fe3a599..e2d840645d0 100644 --- a/test/CodeGen/R600/local-memory-two-objects.ll +++ b/test/CodeGen/R600/local-memory-two-objects.ll @@ -12,9 +12,11 @@ ; SI-CHECK: .long 47180 ; SI-CHECK-NEXT: .long 32768 -; Make sure the lds writes are using different addresses. -; EG-CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]] -; EG-CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]] +; We would like to check the the lds writes are using different +; addresses, but due to variations in the scheduler, we can't do +; this consistently on evergreen GPUs. +; EG-CHECK: LDS_WRITE +; EG-CHECK: LDS_WRITE ; SI-CHECK: DS_WRITE_B32 0, {{v[0-9]*}}, v[[ADDRW:[0-9]*]] ; SI-CHECK-NOT: DS_WRITE_B32 0, {{v[0-9]*}}, v[[ADDRW]]