From 38aad1c16a7774415b2b408154e0aa35aebc7121 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 25 May 2015 16:15:54 +0000 Subject: [PATCH] R600/SI: Fix bug with v_interp_p1_f32 instructions on 16 bank lds chips The src and dst register cannot be the same on chips with 16 lds banks. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@238147 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPU.td | 14 ++++++-- lib/Target/R600/AMDGPUSubtarget.cpp | 1 + lib/Target/R600/AMDGPUSubtarget.h | 5 +++ lib/Target/R600/Processors.td | 25 ++++++++------ lib/Target/R600/SIInstructions.td | 23 +++++++++++-- test/CodeGen/R600/llvm.SI.fs.interp.ll | 45 +++++++++++++++++++++----- 6 files changed, 91 insertions(+), 22 deletions(-) diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index d70c16762c1..2e7e39a54d3 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -132,6 +132,15 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; +class SubtargetFeatureLDSBankCount : SubtargetFeature < + "ldsbankcount"#Value, + "LDSBankCount", + !cast(Value), + "The number of LDS banks per compute unit.">; + +def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; +def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; + class SubtargetFeatureLocalMemorySize : SubtargetFeature< "localmemorysize"#Value, "LocalMemorySize", @@ -189,7 +198,8 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, - FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding]>; + FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureLDSBankCount32]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", [Feature64BitPtr, FeatureFP64, 
FeatureLocalMemorySize65536, @@ -199,7 +209,7 @@ def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts]>; + FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index f78e78ad63f..5288866ba66 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -72,6 +72,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS, WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), + LDSBankCount(0), FrameLowering(TargetFrameLowering::StackGrowsUp, 64 * 16, // Maximum stack alignment (long16) 0), diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 57a084e6b3e..b262cdf5771 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -76,6 +76,7 @@ private: bool GCN3Encoding; bool CIInsts; bool FeatureDisable; + int LDSBankCount; AMDGPUFrameLowering FrameLowering; std::unique_ptr TLInfo; @@ -228,6 +229,10 @@ public: return SGPRInitBug; } + int getLDSBankCount() const { + return LDSBankCount; + } + unsigned getAmdKernelCodeChipID() const; bool enableMachineScheduler() const override { diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index 82c6d13c44f..c0ffede5199 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -103,17 +103,24 @@ def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; // Sea Islands 
//===----------------------------------------------------------------------===// -def : ProcessorModel<"bonaire", SIQuarterSpeedModel, [FeatureSeaIslands]>; - -def : ProcessorModel<"kabini", SIQuarterSpeedModel, [FeatureSeaIslands]>; - -def : ProcessorModel<"kaveri", SIQuarterSpeedModel, [FeatureSeaIslands]>; - -def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureSeaIslands, FeatureFastFMAF32] +def : ProcessorModel<"bonaire", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount32] >; -def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>; +def : ProcessorModel<"kabini", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount16] +>; + +def : ProcessorModel<"kaveri", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"hawaii", SIFullSpeedModel, + [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"mullins", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount16]>; //===----------------------------------------------------------------------===// // Volcanic Islands diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 875f9c96bf0..15c2f3ec193 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -34,6 +34,9 @@ def isSI : Predicate<"Subtarget->getGeneration() " def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">; +def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; +def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; + def SWaitMatchClass : AsmOperandClass { let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; @@ -1436,13 +1439,27 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI , "v_rsq_clamp_f64", let Uses = [M0] in { // FIXME: Specify SchedRW for VINTRP insturctions. 
-defm V_INTERP_P1_F32 : VINTRP_m < - 0x00000000, + +multiclass V_INTERP_P1_F32_m : VINTRP_m < + 0x00000000, (outs VGPR_32:$dst), (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), - (i32 imm:$attr)))]>; + (i32 imm:$attr)))] +>; + +let OtherPredicates = [has32BankLDS] in { + +defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS] + +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { + +defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" defm V_INTERP_P2_F32 : VINTRP_m < 0x00000001, diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.ll b/test/CodeGen/R600/llvm.SI.fs.interp.ll index 6b36140b06c..3d05da616e4 100644 --- a/test/CodeGen/R600/llvm.SI.fs.interp.ll +++ b/test/CodeGen/R600/llvm.SI.fs.interp.ll @@ -1,11 +1,13 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s -;CHECK-NOT: s_wqm -;CHECK: s_mov_b32 -;CHECK-NEXT: v_interp_mov_f32 -;CHECK: v_interp_p1_f32 -;CHECK: v_interp_p2_f32 +;GCN-LABEL: {{^}}main: +;GCN-NOT: s_wqm +;GCN: s_mov_b32 +;GCN-NEXT: v_interp_mov_f32 +;GCN: v_interp_p1_f32 +;GCN: v_interp_p2_f32 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { main_body: @@ -16,7 +18,33 @@ main_body: ret void } -declare void @llvm.AMDGPU.shader.type(i32) +; Test that v_interp_p1 uses different source and 
destination registers +; on 16 bank LDS chips. + +; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: +; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] + +define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { +main_body: + %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) + %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) + %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7) + %25 = call float @fabs(float %22) + %26 = call float @fabs(float %23) + %27 = call float @fabs(float %24) + %28 = call i32 @llvm.SI.packf16(float %25, float %26) + %29 = bitcast i32 %28 to float + %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00) + %31 = bitcast i32 %30 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31) + ret void +} + +; Function Attrs: readnone +declare float @fabs(float) #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 ; Function Attrs: nounwind readnone declare float @llvm.SI.fs.constant(i32, i32, i32) #1 @@ -28,3 +56,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float attributes #0 = { "ShaderType"="0" } attributes #1 = { nounwind readnone } +attributes #2 = { readnone }