From 38d5e1c36d954f1ff6489f58efd1d4865217cf9b Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Wed, 14 Aug 2013 23:24:32 +0000
Subject: [PATCH] R600/SI: Lower BUILD_VECTOR to REG_SEQUENCE v2

Using REG_SEQUENCE for BUILD_VECTOR rather than a series of INSERT_SUBREG
instructions should make it easier for the register allocator to coalesce
unnecessary copies.

v2:
  - Use an SGPR register class if all the operands of BUILD_VECTOR are SGPRs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188427 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 88 ++++++++++++++++++++------
 lib/Target/R600/AMDGPUInstructions.td  | 49 --------------
 lib/Target/R600/AMDGPURegisterInfo.cpp | 32 ++++------
 lib/Target/R600/AMDGPURegisterInfo.h   |  4 ++
 lib/Target/R600/R600RegisterInfo.cpp   | 10 ---
 lib/Target/R600/R600RegisterInfo.h     |  4 --
 lib/Target/R600/SIInstructions.td      | 10 ---
 test/CodeGen/R600/si-lod-bias.ll       | 50 +++++++++++++++
 test/CodeGen/R600/store.ll             |  2 +-
 9 files changed, 137 insertions(+), 112 deletions(-)
 create mode 100644 test/CodeGen/R600/si-lod-bias.ll

diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index d339b09b9e1..22bdb90cca0 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -285,35 +285,85 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     break;
   }
   case ISD::BUILD_VECTOR: {
+    unsigned RegClassID;
     const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
-      break;
+    const AMDGPURegisterInfo *TRI =
+                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
+    const SIRegisterInfo *SIRI =
+                   static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+    EVT VT = N->getValueType(0);
+    unsigned NumVectorElts = VT.getVectorNumElements();
+    assert(VT.getVectorElementType().bitsEq(MVT::i32));
+    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      bool UseVReg = true;
+      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+                                                    U != E; ++U) {
+        if (!U->isMachineOpcode()) {
+          continue;
+        }
+        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+        if (!RC) {
+          continue;
+        }
+        if (SIRI->isSGPRClass(RC)) {
+          UseVReg = false;
+        }
+      }
+      switch(NumVectorElts) {
+      case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+                                     AMDGPU::SReg_32RegClassID;
+        break;
+      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
+                                     AMDGPU::SReg_64RegClassID;
+        break;
+      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
+                                     AMDGPU::SReg_128RegClassID;
+        break;
+      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
+                                     AMDGPU::SReg_256RegClassID;
+        break;
+      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
+                                      AMDGPU::SReg_512RegClassID;
+        break;
+      }
+    } else {
+      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+      // that adds a 128 bits reg copy when going through TwoAddressInstructions
+      // pass. We want to avoid 128 bits copies as much as possible because they
+      // can't be bundled by our scheduler.
+      switch(NumVectorElts) {
+      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+      }
     }
-    unsigned RegClassID;
-    switch(N->getValueType(0).getVectorNumElements()) {
-    case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
-    case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
-    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+
+    if (NumVectorElts == 1) {
+      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
+                                  VT.getVectorElementType(),
+                                  N->getOperand(0), RegClass);
     }
-    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
-    // that adds a 128 bits reg copy when going through TwoAddressInstructions
-    // pass. We want to avoid 128 bits copies as much as possible because they
-    // can't be bundled by our scheduler.
-    SDValue RegSeqArgs[9] = {
-      CurDAG->getTargetConstant(RegClassID, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
-    };
+
+    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " "supported yet");
+    // 16 = Max Num Vector Elements
+    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+    // 1 = Vector Register Class
+    SDValue RegSeqArgs[16 * 2 + 1];
+
+    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
     bool IsRegSeq = true;
     for (unsigned i = 0; i < N->getNumOperands(); i++) {
+      // XXX: Why is this here?
       if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
         IsRegSeq = false;
         break;
       }
-      RegSeqArgs[2 * i + 1] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i) + 1] =
+              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
     }
     if (!IsRegSeq)
       break;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index d6a7759503c..ddb655add83 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -254,61 +254,12 @@ class Insert_Element ;
-// Vector Build pattern
-class Vector1_Build <ValueType vecType, ValueType elemType, RegisterClass rc> : Pat <
-  (vecType (build_vector elemType:$src)),
-  (vecType (COPY_TO_REGCLASS $src, rc))
->;
-
-class Vector2_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1)),
-  (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1)
->;
-
 class Vector4_Build <ValueType vecType, ValueType elemType> : Pat <
   (vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)),
   (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
     (vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3)
 >;
 
-class Vector8_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1,
-                         elemType:$sub2, elemType:$sub3,
-                         elemType:$sub4, elemType:$sub5,
-                         elemType:$sub6, elemType:$sub7)),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
-                                $sub2, sub2), $sub3, sub3),
-                                $sub4, sub4), $sub5, sub5),
-                                $sub6, sub6), $sub7, sub7)
->;
-
-class Vector16_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1,
-                         elemType:$sub2, elemType:$sub3,
-                         elemType:$sub4, elemType:$sub5,
-                         elemType:$sub6, elemType:$sub7,
-                         elemType:$sub8, elemType:$sub9,
-                         elemType:$sub10, elemType:$sub11,
-                         elemType:$sub12, elemType:$sub13,
-                         elemType:$sub14, elemType:$sub15)),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
-                                $sub2, sub2), $sub3, sub3),
-                                $sub4, sub4), $sub5, sub5),
-                                $sub6, sub6), $sub7, sub7),
-                                $sub8, sub8), $sub9, sub9),
-                                $sub10, sub10), $sub11, sub11),
-                                $sub12, sub12), $sub13, sub13),
-                                $sub14, sub14), $sub15, sub15)
->;
-
 // XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
 // can handle COPY instructions.
 // bitconvert pattern
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 3402092ad29..47617a72990 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -46,27 +46,21 @@ unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return 0;
 }
 
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+  static const unsigned SubRegs[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+    AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
+    AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
+    AMDGPU::sub15
+  };
+
+  assert (Channel < array_lengthof(SubRegs));
+  return SubRegs[Channel];
+}
+
 unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
-  switch(IndirectIndex) {
-  case 0: return AMDGPU::sub0;
-  case 1: return AMDGPU::sub1;
-  case 2: return AMDGPU::sub2;
-  case 3: return AMDGPU::sub3;
-  case 4: return AMDGPU::sub4;
-  case 5: return AMDGPU::sub5;
-  case 6: return AMDGPU::sub6;
-  case 7: return AMDGPU::sub7;
-  case 8: return AMDGPU::sub8;
-  case 9: return AMDGPU::sub9;
-  case 10: return AMDGPU::sub10;
-  case 11: return AMDGPU::sub11;
-  case 12: return AMDGPU::sub12;
-  case 13: return AMDGPU::sub13;
-  case 14: return AMDGPU::sub14;
-  case 15: return AMDGPU::sub15;
-  default: llvm_unreachable("indirect index out of range");
-  }
+  return getSubRegFromChannel(IndirectIndex);
 }
 
 #define GET_REGINFO_TARGET_DESC
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 7cbd34b8a78..135d3dd0207 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -50,6 +50,10 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
     assert(!"Unimplemented"); return NULL;
   }
 
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+
   const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
index a42043b3434..4dc63fe57ea 100644
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@@ -86,16 +86,6 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
   }
 }
 
-unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
-  switch (Channel) {
-  default: assert(!"Invalid channel index"); return 0;
-  case 0: return AMDGPU::sub0;
-  case 1: return AMDGPU::sub1;
-  case 2: return AMDGPU::sub2;
-  case 3: return AMDGPU::sub3;
-  }
-}
-
 const RegClassWeight &R600RegisterInfo::getRegClassWeight(
   const TargetRegisterClass *RC) const {
   return RCW;
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index 9b286ee3aba..d458e557a4e 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -43,10 +43,6 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
   /// CFGStructurizer
   virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
 
-  /// \returns the sub reg enum value for the given \p Channel
-  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
-  unsigned getSubRegFromChannel(unsigned Channel) const;
-
   virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
 };
 
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index b20d7c0533a..d4e0b033702 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1525,16 +1525,6 @@ foreach Index = 0-15 in {
   >;
 }
 
-def : Vector1_Build ;
-def : Vector2_Build ;
-def : Vector2_Build ;
-def : Vector4_Build ;
-def : Vector4_Build ;
-def : Vector8_Build ;
-def : Vector8_Build ;
-def : Vector16_Build ;
-def : Vector16_Build ;
-
 def : BitConvert ;
 def : BitConvert ;
 
diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll
new file mode 100644
index 00000000000..9b58f2ab4fe
--- /dev/null
+++ b/test/CodeGen/R600/si-lod-bias.ll
@@ -0,0 +1,50 @@
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+; This shader has the potential to generate illegal VGPR to SGPR copies if
+; the wrong register class is used for the REG_SEQUENCE instructions.
+
+; CHECK: @main
+; CHECK: IMAGE_SAMPLE_B VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}, 15, 0, 0, 0, 0, 0, 0, 0, VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !0
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+  %23 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+  %24 = load <32 x i8> addrspace(2)* %23, !tbaa !0
+  %25 = getelementptr <16 x i8> addrspace(2)* %1, i32 0
+  %26 = load <16 x i8> addrspace(2)* %25, !tbaa !0
+  %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+  %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+  %29 = bitcast float %22 to i32
+  %30 = bitcast float %27 to i32
+  %31 = bitcast float %28 to i32
+  %32 = insertelement <4 x i32> undef, i32 %29, i32 0
+  %33 = insertelement <4 x i32> %32, i32 %30, i32 1
+  %34 = insertelement <4 x i32> %33, i32 %31, i32 2
+  %35 = insertelement <4 x i32> %34, i32 undef, i32 3
+  %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2)
+  %37 = extractelement <4 x float> %36, i32 0
+  %38 = extractelement <4 x float> %36, i32 1
+  %39 = extractelement <4 x float> %36, i32 2
+  %40 = extractelement <4 x float> %36, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index 1bda5e6d61e..506f0b0fb14 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -26,7 +26,7 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
 define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> , float %a, i32 0
-  %1 = insertelement <2 x float> %0, float %b, i32 0
+  %1 = insertelement <2 x float> %0, float %b, i32 1
   store <2 x float> %1, <2 x float> addrspace(1)* %out
   ret void
 }
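
The operand layout the new selection code hands to REG_SEQUENCE is: slot 0 holds the register-class ID, and each vector element then contributes a (value, sub-register index) pair, which is why RegSeqArgs is sized 16 * 2 + 1. Below is a minimal standalone sketch of that indexing scheme in plain C++. It is not the LLVM API: the integer stand-ins, the subRegFromChannel mapping, and the RegClassID value are all made up for illustration only.

// Standalone sketch (not LLVM code): pack BUILD_VECTOR elements the way the
// patch lays out REG_SEQUENCE operands: Args[0] = register class ID, then one
// (value, sub-register index) pair per element.
#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical channel -> sub-register mapping, mirroring the role of
// AMDGPURegisterInfo::getSubRegFromChannel(); the numbers are invented.
static int subRegFromChannel(unsigned Channel) {
  assert(Channel < 16 && "only sub0..sub15 exist");
  return 100 + Channel;  // pretend sub0 == 100, sub1 == 101, ...
}

// Build the flat operand list: 1 class-ID slot + 2 slots per element.
static std::vector<int> buildRegSequenceArgs(int RegClassID,
                                             const std::vector<int> &Elts) {
  assert(Elts.size() <= 16 && "vectors with more than 16 elements unsupported");
  std::vector<int> Args(1 + 2 * Elts.size());
  Args[0] = RegClassID;
  for (unsigned i = 0; i < Elts.size(); ++i) {
    Args[1 + 2 * i] = Elts[i];                   // element value
    Args[1 + 2 * i + 1] = subRegFromChannel(i);  // its sub-register index
  }
  return Args;
}

int main() {
  // A 4-element build_vector going into a (pretend) 128-bit register class.
  std::vector<int> Args = buildRegSequenceArgs(/*RegClassID=*/7, {10, 11, 12, 13});
  for (int A : Args)
    std::printf("%d ", A);  // prints: 7 10 100 11 101 12 102 13 103
  std::printf("\n");
  return 0;
}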