From 4ed9917147b1d1f2616f7c941bbe6999b979f510 Mon Sep 17 00:00:00 2001 From: Vincent Lejeune Date: Fri, 17 May 2013 16:50:32 +0000 Subject: [PATCH] R600: Relax some vector constraints on Dot4. Dot4 now uses 8 scalar operands instead of 2 vector ones, which allows the register coalescer to remove some unneeded COPYs. This patch also defines some structures/functions that can be used to handle every vector instruction (CUBE, Cayman special instructions...) in a similar fashion. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182126 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUISelLowering.h | 1 + lib/Target/R600/R600Defines.h | 74 +++++++++++++++++ lib/Target/R600/R600EmitClauseMarkers.cpp | 6 +- lib/Target/R600/R600ExpandSpecialInstrs.cpp | 41 +++++++-- lib/Target/R600/R600ISelLowering.cpp | 21 +++++ lib/Target/R600/R600InstrInfo.cpp | 92 ++++++++++++++++++++- lib/Target/R600/R600InstrInfo.h | 5 ++ lib/Target/R600/R600Instructions.td | 59 ++++++++++--- lib/Target/R600/R600MachineScheduler.cpp | 4 +- lib/Target/R600/R600Packetizer.cpp | 3 +- test/CodeGen/R600/pv.ll | 2 +- 11 files changed, 281 insertions(+), 27 deletions(-) diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index f108fbc16ee..4c25886632a 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -126,6 +126,7 @@ enum { SMIN, UMIN, URECIP, + DOT4, TEXTURE_FETCH, EXPORT, CONST_ADDRESS, diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index 36bfb18e678..aebe58149d4 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -98,6 +98,80 @@ namespace R600Operands { {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18} }; + enum VecOps { + UPDATE_EXEC_MASK_X, + UPDATE_PREDICATE_X, + WRITE_X, + OMOD_X, + DST_REL_X, + CLAMP_X, + SRC0_X, + SRC0_NEG_X, + SRC0_REL_X, + SRC0_ABS_X, + SRC0_SEL_X, + SRC1_X, + SRC1_NEG_X, + SRC1_REL_X, + SRC1_ABS_X, 
+ SRC1_SEL_X, + PRED_SEL_X, + UPDATE_EXEC_MASK_Y, + UPDATE_PREDICATE_Y, + WRITE_Y, + OMOD_Y, + DST_REL_Y, + CLAMP_Y, + SRC0_Y, + SRC0_NEG_Y, + SRC0_REL_Y, + SRC0_ABS_Y, + SRC0_SEL_Y, + SRC1_Y, + SRC1_NEG_Y, + SRC1_REL_Y, + SRC1_ABS_Y, + SRC1_SEL_Y, + PRED_SEL_Y, + UPDATE_EXEC_MASK_Z, + UPDATE_PREDICATE_Z, + WRITE_Z, + OMOD_Z, + DST_REL_Z, + CLAMP_Z, + SRC0_Z, + SRC0_NEG_Z, + SRC0_REL_Z, + SRC0_ABS_Z, + SRC0_SEL_Z, + SRC1_Z, + SRC1_NEG_Z, + SRC1_REL_Z, + SRC1_ABS_Z, + SRC1_SEL_Z, + PRED_SEL_Z, + UPDATE_EXEC_MASK_W, + UPDATE_PREDICATE_W, + WRITE_W, + OMOD_W, + DST_REL_W, + CLAMP_W, + SRC0_W, + SRC0_NEG_W, + SRC0_REL_W, + SRC0_ABS_W, + SRC0_SEL_W, + SRC1_W, + SRC1_NEG_W, + SRC1_REL_W, + SRC1_ABS_W, + SRC1_SEL_W, + PRED_SEL_W, + IMM_0, + IMM_1, + VEC_COUNT + }; + } //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index bae39c5c1fc..f45e958995f 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -36,8 +36,7 @@ private: case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT4_eg_pseudo: - case AMDGPU::DOT4_r600_pseudo: + case AMDGPU::DOT_4: return 4; case AMDGPU::KILL: return 0; @@ -71,8 +70,7 @@ private: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: case AMDGPU::COPY: - case AMDGPU::DOT4_eg_pseudo: - case AMDGPU::DOT4_r600_pseudo: + case AMDGPU::DOT_4: return true; default: return false; diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index f8c900f7277..a1919305111 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -182,6 +182,41 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); continue; } + case AMDGPU::DOT_4: { + + const R600RegisterInfo &TRI = 
TII->getRegisterInfo(); + + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + bool Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned SubDstReg = + AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + MachineInstr *BMI = + TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Mask) { + TII->addFlag(BMI, 0, MO_FLAG_MASK); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + unsigned Opcode = BMI->getOpcode(); + // While not strictly necessary from hw point of view, we force + // all src operands of a dot4 inst to belong to the same slot. + unsigned Src0 = BMI->getOperand( + TII->getOperandIdx(Opcode, R600Operands::SRC0)) + .getReg(); + unsigned Src1 = BMI->getOperand( + TII->getOperandIdx(Opcode, R600Operands::SRC1)) + .getReg(); + assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); + } + MI.eraseFromParent(); + continue; + } } bool IsReduction = TII->isReductionOp(MI.getOpcode()); @@ -268,12 +303,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::CUBE_eg_pseudo: Opcode = AMDGPU::CUBE_eg_real; break; - case AMDGPU::DOT4_r600_pseudo: - Opcode = AMDGPU::DOT4_r600_real; - break; - case AMDGPU::DOT4_eg_pseudo: - Opcode = AMDGPU::DOT4_eg_real; - break; default: break; } diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 91165a1fd9d..235e22ec705 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -631,6 +631,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const }; return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19); } + case AMDGPUIntrinsic::AMDGPU_dp4: { + SDValue Args[8] = { + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(0, MVT::i32)), + 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(3, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(3, MVT::i32)) + }; + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8); + } case r600_read_ngroups_x: return LowerImplicitParameter(DAG, VT, DL, 0); diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 56a8caf9c94..fda8cdb1c59 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -116,9 +116,6 @@ bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { bool R600InstrInfo::isReductionOp(unsigned Opcode) const { switch(Opcode) { default: return false; - case AMDGPU::DOT4_r600_pseudo: - case AMDGPU::DOT4_eg_pseudo: - return true; } } @@ -866,6 +863,95 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB return MIB; } +#define OPERAND_CASE(Label) \ + case Label: { \ + static const R600Operands::VecOps Ops[] = \ + { \ + Label##_X, \ + Label##_Y, \ + Label##_Z, \ + Label##_W \ + }; \ + return Ops[Slot]; \ + } + +static R600Operands::VecOps +getSlotedOps(R600Operands::Ops Op, unsigned Slot) { + switch (Op) { + OPERAND_CASE(R600Operands::UPDATE_EXEC_MASK) + OPERAND_CASE(R600Operands::UPDATE_PREDICATE) + OPERAND_CASE(R600Operands::WRITE) + OPERAND_CASE(R600Operands::OMOD) + OPERAND_CASE(R600Operands::DST_REL) + OPERAND_CASE(R600Operands::CLAMP) + 
OPERAND_CASE(R600Operands::SRC0) + OPERAND_CASE(R600Operands::SRC0_NEG) + OPERAND_CASE(R600Operands::SRC0_REL) + OPERAND_CASE(R600Operands::SRC0_ABS) + OPERAND_CASE(R600Operands::SRC0_SEL) + OPERAND_CASE(R600Operands::SRC1) + OPERAND_CASE(R600Operands::SRC1_NEG) + OPERAND_CASE(R600Operands::SRC1_REL) + OPERAND_CASE(R600Operands::SRC1_ABS) + OPERAND_CASE(R600Operands::SRC1_SEL) + OPERAND_CASE(R600Operands::PRED_SEL) + default: + llvm_unreachable("Wrong Operand"); + } +} + +#undef OPERAND_CASE + +static int +getVecOperandIdx(R600Operands::VecOps Op) { + return 1 + Op; +} + + +MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( + MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) + const { + assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + unsigned Opcode; + const AMDGPUSubtarget &ST = TM.getSubtarget(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX) + Opcode = AMDGPU::DOT4_r600; + else + Opcode = AMDGPU::DOT4_eg; + MachineBasicBlock::iterator I = MI; + MachineOperand &Src0 = MI->getOperand( + getVecOperandIdx(getSlotedOps(R600Operands::SRC0, Slot))); + MachineOperand &Src1 = MI->getOperand( + getVecOperandIdx(getSlotedOps(R600Operands::SRC1, Slot))); + MachineInstr *MIB = buildDefaultInstruction( + MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); + static const R600Operands::Ops Operands[14] = { + R600Operands::UPDATE_EXEC_MASK, + R600Operands::UPDATE_PREDICATE, + R600Operands::WRITE, + R600Operands::OMOD, + R600Operands::DST_REL, + R600Operands::CLAMP, + R600Operands::SRC0_NEG, + R600Operands::SRC0_REL, + R600Operands::SRC0_ABS, + R600Operands::SRC0_SEL, + R600Operands::SRC1_NEG, + R600Operands::SRC1_REL, + R600Operands::SRC1_ABS, + R600Operands::SRC1_SEL, + }; + + for (unsigned i = 0; i < 14; i++) { + MachineOperand &MO = MI->getOperand( + getVecOperandIdx(getSlotedOps(Operands[i], Slot))); + assert (MO.isImm()); + setImmOperand(MIB, Operands[i], MO.getImm()); + } + MIB->getOperand(20).setImm(0); 
+ return MIB; +} + MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, MachineBasicBlock::iterator I, unsigned DstReg, diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index 5a84cd5a496..f9ccf4fbb9a 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -198,6 +198,11 @@ namespace llvm { unsigned Src0Reg, unsigned Src1Reg = 0) const; + MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, + MachineInstr *MI, + unsigned Slot, + unsigned DstReg) const; + MachineInstr *buildMovImm(MachineBasicBlock &BB, MachineBasicBlock::iterator I, unsigned DstReg, diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 22d320e4c7d..1aa2c0de525 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -593,6 +593,13 @@ def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", [SDNPVariadic] >; +def DOT4 : SDNode<"AMDGPUISD::DOT4", + SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, + SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, + SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, + [] +>; + def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; @@ -1229,17 +1236,49 @@ class CNDGE_Common inst> : R600_3OP < [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))] >; -multiclass DOT4_Common inst> { - def _pseudo : R600_REDUCTION ; - - def _real : R600_2OP ; +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +class R600_VEC2OP pattern> : InstR600 <(outs R600_Reg32:$dst), (ins +// Slot X + UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, + OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, + R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, + R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X, + 
R600_Pred:$pred_sel_X, +// Slot Y + UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y, + OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, + R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, + R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y, + R600_Pred:$pred_sel_Y, +// Slot Z + UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z, + OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, + R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, + R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z, + R600_Pred:$pred_sel_Z, +// Slot W + UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W, + OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, + R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, + R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W, + R600_Pred:$pred_sel_W, + LITERAL:$literal0, LITERAL:$literal1), + "", + pattern, + AnyALU> {} } +def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 + R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, + R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, + R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, + R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; + + +class DOT4_Common inst> : R600_2OP ; + + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { multiclass CUBE_Common inst> { @@ -1412,7 +1451,7 @@ let Predicates = [isR600] in { def CNDE_r600 : CNDE_Common<0x18>; def CNDGT_r600 : CNDGT_Common<0x19>; def CNDGE_r600 : CNDGE_Common<0x1A>; - defm DOT4_r600 : DOT4_Common<0x50>; + def DOT4_r600 : DOT4_Common<0x50>; defm CUBE_r600 : CUBE_Common<0x52>; def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; @@ -1611,7 +1650,7 @@ let Predicates = [isEGorCayman] in { def CNDGE_eg : CNDGE_Common<0x1B>; def MUL_LIT_eg : MUL_LIT_Common<0x1F>; def LOG_CLAMPED_eg : 
LOG_CLAMPED_Common<0x82>; - defm DOT4_eg : DOT4_Common<0xBE>; + def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; let hasSideEffects = 1 in { diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index c6709a8dc38..b1f4541f2a5 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -185,6 +185,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: return AluT_XYZW; case AMDGPU::COPY: if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) { @@ -252,8 +253,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) { case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT4_eg_pseudo: - case AMDGPU::DOT4_r600_pseudo: + case AMDGPU::DOT_4: return IDAlu; case AMDGPU::TEX_VTX_CONSTBUF: case AMDGPU::TEX_VTX_TEXBUF: diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index 3e6504ddf86..ec89bff3c1b 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -86,7 +86,8 @@ private: if (BI->getOperand(OperandIdx).getImm() == 0) continue; unsigned Dst = BI->getOperand(0).getReg(); - if (BI->getOpcode() == AMDGPU::DOT4_r600_real) { + if (BI->getOpcode() == AMDGPU::DOT4_r600 || + BI->getOpcode() == AMDGPU::DOT4_eg) { Result[Dst] = AMDGPU::PV_X; continue; } diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll index 37c3d9d7d6d..062b7412996 100644 --- a/test/CodeGen/R600/pv.ll +++ b/test/CodeGen/R600/pv.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=r600 | FileCheck %s ;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) -;CHECK-NEXT: CNDGE T{{[0-9].[XYZW]}}, PV.x +;CHECK: CNDGE * T{{[0-9].[XYZW]}}, PV.x define void @main() #0 { main_body: