From 3c698f35e0ce7e8ecb761dab6a261b98996543f7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 22 May 2014 18:09:12 +0000
Subject: [PATCH] R600: Try to convert BFE back to standard bit ops when possible.

This allows existing DAG combines to work on them, and then we can
re-match to BFE if necessary during instruction selection.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209462 91177308-0d34-0410-b5e6-96231b3b80d8
---
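Note for reviewers (kept below the --- cut, so it is not part of the commit):
a minimal sketch of the unsigned case, assuming the @llvm.AMDGPU.bfe.u32
semantics the tests below rely on (extract width bits of the first operand
starting at bit offset); the function name is made up. Once offset + width
reaches 32, the combine added in this patch rewrites the BFE node as a plain
logical shift right by offset, so existing DAG combines can fold it with
surrounding shifts, and instruction selection can still re-match a BFE if the
node survives.

  ; offset 24 + width 8 covers bit 31, so this is just %x >> 24 and the new
  ; combine emits ISD::SRL (ISD::SRA for the signed @llvm.AMDGPU.bfe.i32).
  define i32 @example_bfe_to_srl(i32 %x) nounwind {
    %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) nounwind readnone
    ret i32 %bfe
  }
  declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone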
 lib/Target/R600/AMDGPUISelLowering.cpp   |  21 ++++
 test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll | 128 +++++++++++++++++++++++
 test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll | 105 ++++++++++++++++++-
 test/CodeGen/R600/sext-in-reg.ll         |  48 ++++++++-
 test/CodeGen/R600/udivrem64.ll           |   6 +-
 5 files changed, 297 insertions(+), 11 deletions(-)

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a4cb4f5c405..6c443ea828b 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -1377,6 +1377,20 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
       if (OpSignBits >= SignBits)
         return BitsFrom;
+
+      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
+      if (Signed) {
+        // This is a sign_extend_inreg, so replace it to take advantage of
+        // existing DAG combines. If it is not eliminated, we will match it
+        // back to BFE during instruction selection.
+
+        // TODO: The sext_inreg of extended types ends up expanded to shifts,
+        // although we could handle them in a single BFE.
+        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
+                           DAG.getValueType(SmallVT));
+      }
+
+      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
     }
 
     if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
@@ -1396,6 +1410,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       APInt Demanded = APInt::getBitsSet(32,
                                          OffsetVal,
                                          OffsetVal + WidthVal);
+
+      if ((OffsetVal + WidthVal) >= 32) {
+        SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
+        return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+                           BitsFrom, ShiftVal);
+      }
+
       APInt KnownZero, KnownOne;
       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                             !DCI.isBeforeLegalizeOps());
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
index 71d2b6e7c2e..eb509423282 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
@@ -69,6 +69,115 @@ define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i
   ret void
 }
 
+; FUNC-LABEL: @bfe_i32_test_6
+; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: S_ENDPGM
+define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_7
+; SI-NOT: SHL
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FIXME: The shifts should be 1 BFE
+; FUNC-LABEL: @bfe_i32_test_8
+; SI: BUFFER_LOAD_DWORD
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: S_ENDPGM
+define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_9
+; SI-NOT: BFE
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_10
+; SI-NOT: BFE
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_11
+; SI-NOT: BFE
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_12
+; SI-NOT: BFE
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_13
+; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = ashr i32 %x, 31
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+}
+
+; FUNC-LABEL: @bfe_i32_test_14
+; SI-NOT: LSHR
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = lshr i32 %x, 31
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+}
+
 ; FUNC-LABEL: @bfe_i32_constant_fold_test_0
 ; SI-NOT: BFE
 ; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
@@ -296,3 +405,22 @@ define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
   store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; XXX - This should really be a single BFE, but the sext_inreg of the
+; extended type i24 is never custom lowered.
+; FUNC-LABEL: @bfe_sext_in_reg_i24
+; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
+; SI: V_LSHLREV_B32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
+; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
+; XSI: V_BFE_I32 [[BFE:v[0-9]+]], [[LOAD]], 0, 8
+; XSI-NOT: SHL
+; XSI-NOT: SHR
+; XSI: BUFFER_STORE_DWORD [[BFE]],
+define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
+  %shl = shl i32 %bfe, 8
+  %ashr = ashr i32 %shl, 8
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
index 6ed1ad5d2e6..1a62253eeb7 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
@@ -161,9 +161,9 @@ define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrsp
 
 ; FUNC-LABEL: @bfe_u32_test_1
 ; SI: BUFFER_LOAD_DWORD
-; SI: V_BFE_U32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
+; SI: V_AND_B32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
 ; SI: S_ENDPGM
-; EG: BFE_UINT
+; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
 define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1)
@@ -220,7 +220,7 @@ define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 
 ; FUNC-LABEL: @bfe_u32_test_6
 ; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: V_BFE_U32 {{v[0-9]+}}, {{v[0-9]+}}, 1, 31
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; SI: S_ENDPGM
 define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
@@ -243,8 +243,9 @@ define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 }
 
 ; FUNC-LABEL: @bfe_u32_test_8
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: V_BFE_U32 {{v[0-9]+}}, {{v[0-9]+}}, 31, 1
+; SI-NOT: BFE
+; SI: V_AND_B32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI-NOT: BFE
 ; SI: S_ENDPGM
 define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
@@ -254,6 +255,76 @@ define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
+; FUNC-LABEL: @bfe_u32_test_9
+; SI-NOT: BFE
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_10
+; SI-NOT: BFE
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_11
+; SI-NOT: BFE
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_12
+; SI-NOT: BFE
+; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_13
+; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = ashr i32 %x, 31
+  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+}
+
+; FUNC-LABEL: @bfe_u32_test_14
+; SI-NOT: LSHR
+; SI-NOT: BFE
+; SI: S_ENDPGM
+define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = lshr i32 %x, 31
+  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+}
+
 ; FUNC-LABEL: @bfe_u32_constant_fold_test_0
 ; SI-NOT: BFE
 ; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
@@ -457,3 +528,27 @@ define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_17
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
+  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @bfe_u32_constant_fold_test_18
+; SI-NOT: BFE
+; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
+; SI: BUFFER_STORE_DWORD [[VREG]],
+; SI: S_ENDPGM
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
+  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
index 404c9b8b812..1b02e4bf801 100644
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -417,8 +417,8 @@ define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwi
 
 ; This really should be folded into 1
 ; FUNC-LABEL: @bfe_16_bfe_8
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI-NOT: BFE
 ; SI: S_ENDPGM
 define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
@@ -430,7 +430,7 @@ define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwi
 
 ; Make sure there isn't a redundant BFE
 ; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe
-; SI: S_BFE_I32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000
+; SI: S_SEXT_I32_I8 s{{[0-9]+}}, s{{[0-9]+}}
 ; SI-NOT: BFE
 ; SI: S_ENDPGM
 define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -478,3 +478,47 @@ define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %
   store i32 %ashr, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: @sext_in_reg_i1_bfe_offset_0:
+; SI-NOT: SHR
+; SI-NOT: SHL
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: S_ENDPGM
+define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %shr = ashr i32 %shl, 31
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i1_bfe_offset_1
+; SI: BUFFER_LOAD_DWORD
+; SI-NOT: SHL
+; SI-NOT: SHR
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
+; SI: S_ENDPGM
+define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 30
+  %shr = ashr i32 %shl, 30
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i2_bfe_offset_1:
+; SI: BUFFER_LOAD_DWORD
+; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
+; SI: S_ENDPGM
+define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %x = load i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 30
+  %shr = ashr i32 %shl, 30
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll
index b3caebf123d..a71315a12d8 100644
--- a/test/CodeGen/R600/udivrem64.ll
+++ b/test/CodeGen/R600/udivrem64.ll
@@ -3,8 +3,7 @@
 
 ;FUNC-LABEL: @test_udiv
 ;EG: RECIP_UINT
-;EG: BFE_UINT
-;EG: BFE_UINT
+;EG: LSHL {{.*}}, 1,
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
@@ -74,8 +73,7 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
-;EG: BFE_UINT
-;EG: BFE_UINT
+;EG: AND_INT {{.*}}, 1,
 ;SI: S_ENDPGM
 define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y