diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 25acaa57187..d8e424936da 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -136,8 +136,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
 
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
-
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 1c15eb8973e..97f2d4b5935 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -1901,8 +1901,13 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       Inst->eraseFromParent();
       continue;
 
+    case AMDGPU::S_BFE_I64: {
+      splitScalar64BitBFE(Worklist, Inst);
+      Inst->eraseFromParent();
+      continue;
+    }
+
     case AMDGPU::S_BFE_U64:
-    case AMDGPU::S_BFE_I64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
     }
@@ -2167,6 +2172,65 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
   Worklist.push_back(Second);
 }
 
+void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+                                      MachineInstr *Inst) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  uint32_t Imm = Inst->getOperand(2).getImm();
+  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+  // Only sext_inreg cases handled.
+  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
+         BitWidth <= 32 &&
+         Offset == 0 &&
+         "Not implemented");
+
+  if (BitWidth < 32) {
+    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
+      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
+      .addImm(0)
+      .addImm(BitWidth);
+
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
+      .addImm(31)
+      .addReg(MidRegLo);
+
+    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+      .addReg(MidRegLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(MidRegHi)
+      .addImm(AMDGPU::sub1);
+
+    MRI.replaceRegWith(Dest.getReg(), ResultReg);
+    return;
+  }
+
+  MachineOperand &Src = Inst->getOperand(1);
+  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
+    .addImm(31)
+    .addReg(Src.getReg(), 0, AMDGPU::sub0);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+    .addReg(Src.getReg(), 0, AMDGPU::sub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(TmpReg)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+}
+
 void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
                                         MachineInstr *Inst) const {
   // Add the implict and explicit register definitions.
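For reference, here is a standalone C++ sketch (an editor's illustration, not part of the patch) of what the splitScalar64BitBFE expansion computes: decode the packed BFE control operand, sign-extend the selected field within the low dword (the V_BFE_I32), and replicate its sign bit into the high dword (the V_ASHRREV_I32), assuming two's-complement conversions:

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

// Decode the packed S_BFE control operand: offset in bits [5:0],
// field width in bits [22:16].
std::pair<uint32_t, uint32_t> decodeBFEImm(uint32_t Imm) {
  return {Imm & 0x3f, (Imm & 0x7f0000) >> 16};
}

// Emulate the VALU expansion above: sign-extend the low Width bits of
// the low dword, then broadcast its sign bit into the high dword and
// recombine the two halves (the REG_SEQUENCE).
uint64_t emulateSextInRegI64(uint64_t Src, uint32_t Imm) {
  auto [Offset, Width] = decodeBFEImm(Imm);
  assert(Offset == 0 && Width >= 1 && Width <= 32 && "only the handled cases");
  uint32_t Lo = static_cast<uint32_t>(Src);
  if (Width < 32) // V_BFE_I32: shift the field up, arithmetic-shift back down.
    Lo = static_cast<uint32_t>(static_cast<int32_t>(Lo << (32 - Width)) >>
                               (32 - Width));
  uint32_t Hi = static_cast<uint32_t>(static_cast<int32_t>(Lo) >> 31);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  assert(emulateSextInRegI64(1, 0x10000) == UINT64_MAX);               // i1
  assert(emulateSextInRegI64(0x80, 0x80000) == 0xffffffffffffff80ull); // i8
  assert(emulateSextInRegI64(0x7fffffff, 0x200000) == 0x7fffffff);     // i32
  return 0;
}
```

Note that the Width == 32 case needs no bit-field extract at all, which is why the code above emits only the shift for that path.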
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 5c5d8476235..ce32fd7fa65 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -52,6 +52,8 @@ private:
   void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
                             MachineInstr *Inst) const;
+  void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+                           MachineInstr *Inst) const;
 
   void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index b84a2b12149..2c9ffaffca8 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -350,6 +350,11 @@ class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
   opName#" $dst, $src0, $src1", pattern
 >;
 
+class SOP2_64_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
 class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
   op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
   opName#" $dst, $src0, $src1", pattern
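The new SOP2_64_32 class gives s_bfe_i64 its actual operand shape: a 64-bit destination and source with a 32-bit control operand. That control word packs the field offset in bits [5:0] and the field width in bits [22:16], so the immediates used by the patterns in the next file can be sanity-checked with a small constexpr helper (illustrative only; the helper name is ours, not part of the patch):

```cpp
#include <cstdint>

// Pack an S_BFE control operand: offset in [5:0], width in [22:16].
constexpr uint32_t bfeImm(uint32_t Offset, uint32_t Width) {
  return (Width << 16) | Offset;
}

static_assert(bfeImm(0, 1) == 0x10000, "sext_inreg i64, i1");
static_assert(bfeImm(0, 8) == 0x80000, "sext_inreg i64, i8");
static_assert(bfeImm(0, 16) == 0x100000, "sext_inreg i64, i16");
static_assert(bfeImm(0, 32) == 0x200000, "sext_inreg i64, i32");
```

The same encoding explains the bare 65536 (0x10000) already used by the 32-bit i1 pattern.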
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 193a083085a..6dcb6de0797 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -297,7 +297,7 @@ def S_MUL_I32 : SOP2_32 <0x00000026, "s_mul_i32",
 def S_BFE_U32 : SOP2_32 <0x00000027, "s_bfe_u32", []>;
 def S_BFE_I32 : SOP2_32 <0x00000028, "s_bfe_i32", []>;
 def S_BFE_U64 : SOP2_64 <0x00000029, "s_bfe_u64", []>;
-def S_BFE_I64 : SOP2_64 <0x0000002a, "s_bfe_i64", []>;
+def S_BFE_I64 : SOP2_64_32 <0x0000002a, "s_bfe_i64", []>;
 //def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "s_cbranch_g_fork", []>;
 def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "s_absdiff_i32", []>;
@@ -2972,30 +2972,25 @@ defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
 
 def : Pat<(i32 (sext_inreg i32:$src, i1)),
           (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
 
-// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
-// might not be worth the effort, and will need to expand to shifts when
-// fixing SGPR copies.
-
 // Handle sext_inreg in i64
 def : Pat <
   (i64 (sext_inreg i64:$src, i1)),
-  (REG_SEQUENCE SReg_64,
-    (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0, // 0 | 1 << 16
-    (S_MOV_B32 -1), sub1)
+  (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
 >;
 
 def : Pat <
   (i64 (sext_inreg i64:$src, i8)),
-  (REG_SEQUENCE SReg_64,
-    (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0,
-    (S_MOV_B32 -1), sub1)
+  (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
 >;
 
 def : Pat <
   (i64 (sext_inreg i64:$src, i16)),
-  (REG_SEQUENCE SReg_64,
-    (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0,
-    (S_MOV_B32 -1), sub1)
+  (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i32)),
+  (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
 >;
 
 class ZExt_i64_i32_Pat <SDNode ext> : Pat <
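The tests below build sign_extend_inreg from a shl/ashr pair, which the DAG combiner canonicalizes to ISD::SIGN_EXTEND_INREG; they also switch the setup operation from add to shl, presumably so demanded-bits simplification cannot narrow the 64-bit value before the pattern forms. A quick standalone check of the shl/ashr identity (plain C++, not LLVM code):

```cpp
#include <cassert>
#include <cstdint>

// (ashr (shl x, 64 - w), 64 - w) sign-extends the low w bits of x.
// Left-shift in unsigned to avoid signed-overflow UB; the arithmetic
// right shift then replicates the sign bit of the w-bit field.
int64_t sextInReg(int64_t X, unsigned W) {
  unsigned Sh = 64 - W; // valid for 1 <= W <= 63
  return static_cast<int64_t>(static_cast<uint64_t>(X) << Sh) >> Sh;
}

int main() {
  assert(sextInReg(1, 1) == -1);           // i1 case: shl/ashr by 63
  assert(sextInReg(0x80, 8) == -128);      // i8 case: shl/ashr by 56
  assert(sextInReg(0x7fff, 16) == 0x7fff); // i16: positive value unchanged
  return 0;
}
```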
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
index 693ef9da375..d364e6bcae2 100644
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
@@ -75,12 +76,13 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a,
 }
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
-; SI: s_mov_b32 {{s[0-9]+}}, -1
-; SI: s_add_i32 [[VAL:s[0-9]+]],
-; SI: s_bfe_i32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10000
-; SI: buffer_store_dwordx2
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 63
   %ashr = ashr i64 %shl, 63
   store i64 %ashr, i64 addrspace(1)* %out, align 8
@@ -88,15 +90,16 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
 }
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
-; SI: s_mov_b32 {{s[0-9]+}}, -1
-; SI: s_add_i32 [[VAL:s[0-9]+]],
-; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: buffer_store_dwordx2
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: ADD_INT
-; EG-NEXT: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
 ; EG: ASHR [[RES_HI]]
 ; EG-NOT: BFE_INT
 ; EG: LSHR
@@ -104,7 +107,7 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 56
   %ashr = ashr i64 %shl, 56
   store i64 %ashr, i64 addrspace(1)* %out, align 8
@@ -112,15 +115,16 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
 }
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
-; SI: s_mov_b32 {{s[0-9]+}}, -1
-; SI: s_add_i32 [[VAL:s[0-9]+]],
-; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: buffer_store_dwordx2
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: ADD_INT
-; EG-NEXT: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
 ; EG: ASHR [[RES_HI]]
 ; EG-NOT: BFE_INT
 ; EG: LSHR
@@ -128,7 +132,7 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 48
   %ashr = ashr i64 %shl, 48
   store i64 %ashr, i64 addrspace(1)* %out, align 8
@@ -136,24 +140,24 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
 }
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
-; SI: s_load_dword
-; SI: s_load_dword
-; SI: s_add_i32 [[ADD:s[0-9]+]],
-; SI: s_ashr_i32 s{{[0-9]+}}, [[ADD]], 31
-; SI: buffer_store_dwordx2
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE_INT
-; EG: ADD_INT {{\*?}} [[RES_LO]]
+
 ; EG: ASHR [[RES_HI]]
-; EG: ADD_INT
+
 ; EG: LSHR
 ; EG: LSHR
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 32
   %ashr = ashr i64 %shl, 32
   store i64 %ashr, i64 addrspace(1)* %out, align 8
@@ -175,6 +179,89 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
 ; ret void
 ; }
 
+; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %bptr, i32 %tid
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 63
+  %ashr = ashr i64 %shl, 63
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %bptr, i32 %tid
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 56
+  %ashr = ashr i64 %shl, 56
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %bptr, i32 %tid
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 48
+  %ashr = ashr i64 %shl, 48
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
+define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %bptr, i32 %tid
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 32
+  %ashr = ashr i64 %shl, 32
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
 ; SI-NOT: {{[^@]}}bfe
 ; SI: s_lshl_b32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6