mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-13 04:30:23 +00:00
Reapply "R600: Add new intrinsic to read work dimensions"
This effectively reverts the revert in r219707, after fixing the test to work with the new function name format and the renamed intrinsic. Reviewed-by: Tom Stellard <tom@stellard.net> Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@219710 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
f0f98417ca
commit
d6315ea5a5
@ -33,10 +33,14 @@ defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
|
||||
"__builtin_r600_read_tgid">;
|
||||
defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
|
||||
"__builtin_r600_read_tidig">;
|
||||
|
||||
} // End TargetPrefix = "r600"
|
||||
|
||||
let TargetPrefix = "AMDGPU" in {
|
||||
|
||||
class AMDGPUReadPreloadRegisterIntrinsic<string name>
|
||||
: Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<name>;
|
||||
|
||||
def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">,
|
||||
// 1st parameter: Numerator
|
||||
// 2nd parameter: Denominator
|
||||
@ -72,4 +76,7 @@ def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">,
|
||||
def int_AMDGPU_ldexp : GCCBuiltin<"__builtin_amdgpu_ldexp">,
|
||||
Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
def int_AMDGPU_read_workdim : AMDGPUReadPreloadRegisterIntrinsic <
|
||||
"__builtin_amdgpu_read_workdim">;
|
||||
|
||||
} // End TargetPrefix = "AMDGPU"
|
||||
|
@ -30,6 +30,9 @@ public:
|
||||
/// Number of bytes in the LDS that are being used.
|
||||
unsigned LDSSize;
|
||||
|
||||
/// Start of implicit kernel args
|
||||
unsigned ABIArgOffset;
|
||||
|
||||
unsigned getShaderType() const {
|
||||
return ShaderType;
|
||||
}
|
||||
|
@ -809,6 +809,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
|
||||
case Intrinsic::r600_read_local_size_z:
|
||||
return LowerImplicitParameter(DAG, VT, DL, 8);
|
||||
|
||||
case Intrinsic::AMDGPU_read_workdim:
|
||||
return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
|
||||
|
||||
case Intrinsic::r600_read_tgid_x:
|
||||
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
|
||||
AMDGPU::T1_X, VT);
|
||||
@ -1698,7 +1701,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
|
||||
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
|
||||
*DAG.getContext());
|
||||
MachineFunction &MF = DAG.getMachineFunction();
|
||||
unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
|
||||
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
|
||||
|
||||
SmallVector<ISD::InputArg, 8> LocalIns;
|
||||
|
||||
@ -1716,7 +1719,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
|
||||
MemVT = MemVT.getVectorElementType();
|
||||
}
|
||||
|
||||
if (ShaderType != ShaderType::COMPUTE) {
|
||||
if (MFI->getShaderType() != ShaderType::COMPUTE) {
|
||||
unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
|
||||
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
|
||||
InVals.push_back(Register);
|
||||
@ -1748,16 +1751,18 @@ SDValue R600TargetLowering::LowerFormalArguments(
|
||||
|
||||
unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
|
||||
unsigned PartOffset = VA.getLocMemOffset();
|
||||
unsigned Offset = 36 + VA.getLocMemOffset();
|
||||
|
||||
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
|
||||
SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
|
||||
DAG.getConstant(36 + PartOffset, MVT::i32),
|
||||
DAG.getConstant(Offset, MVT::i32),
|
||||
DAG.getUNDEF(MVT::i32),
|
||||
PtrInfo,
|
||||
MemVT, false, true, true, 4);
|
||||
|
||||
// 4 is the preferred alignment for the CONSTANT memory space.
|
||||
InVals.push_back(Arg);
|
||||
MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
|
||||
}
|
||||
return Chain;
|
||||
}
|
||||
|
@ -519,11 +519,11 @@ SDValue SITargetLowering::LowerFormalArguments(
|
||||
if (VA.isMemLoc()) {
|
||||
VT = Ins[i].VT;
|
||||
EVT MemVT = Splits[i].VT;
|
||||
const unsigned Offset = 36 + VA.getLocMemOffset();
|
||||
// The first 36 bytes of the input buffer contain information about
|
||||
// thread group and global sizes.
|
||||
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
|
||||
36 + VA.getLocMemOffset(),
|
||||
Ins[i].Flags.isSExt());
|
||||
Offset, Ins[i].Flags.isSExt());
|
||||
|
||||
const PointerType *ParamTy =
|
||||
dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
|
||||
@ -537,6 +537,7 @@ SDValue SITargetLowering::LowerFormalArguments(
|
||||
}
|
||||
|
||||
InVals.push_back(Arg);
|
||||
Info->ABIArgOffset = Offset + MemVT.getStoreSize();
|
||||
continue;
|
||||
}
|
||||
assert(VA.isRegLoc() && "Parameter must be in a register!");
|
||||
@ -927,6 +928,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
case Intrinsic::r600_read_local_size_z:
|
||||
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
||||
SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
|
||||
|
||||
case Intrinsic::AMDGPU_read_workdim:
|
||||
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
|
||||
MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
|
||||
false);
|
||||
|
||||
case Intrinsic::r600_read_tgid_x:
|
||||
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
|
||||
TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
|
||||
|
@ -128,6 +128,20 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}get_work_dim:
|
||||
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
|
||||
; EG: MOV [[VAL]], KC0[2].Z
|
||||
|
||||
; SI: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0xb
|
||||
; SI: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
|
||||
; SI: BUFFER_STORE_DWORD [[VVAL]]
|
||||
define void @get_work_dim (i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%0 = call i32 @llvm.AMDGPU.read.workdim() #0
|
||||
store i32 %0, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; The tgid values are stored in sgprs offset by the number of user sgprs.
|
||||
; Currently we always use exactly 2 user sgprs for the pointer to the
|
||||
; kernel arguments, but this may change in the future.
|
||||
@ -209,4 +223,6 @@ declare i32 @llvm.r600.read.tidig.x() #0
|
||||
declare i32 @llvm.r600.read.tidig.y() #0
|
||||
declare i32 @llvm.r600.read.tidig.z() #0
|
||||
|
||||
declare i32 @llvm.AMDGPU.read.workdim() #0
|
||||
|
||||
attributes #0 = { readnone }
|
||||
|
Loading…
Reference in New Issue
Block a user