From 2f5565d21c937d3e0561e3c1b79c4643e82ebd13 Mon Sep 17 00:00:00 2001 From: Che-Liang Chiou Date: Thu, 10 Mar 2011 04:05:57 +0000 Subject: [PATCH] ptx: add the rest of special registers of ISA version 2.0 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127397 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IntrinsicsPTX.td | 53 ++++-- lib/Target/PTX/PTXIntrinsicInstrInfo.td | 50 +++++- test/CodeGen/PTX/intrinsic.ll | 210 ++++++++++++++++++++---- 3 files changed, 257 insertions(+), 56 deletions(-) diff --git a/include/llvm/IntrinsicsPTX.td b/include/llvm/IntrinsicsPTX.td index 9e372301c79..cbcd56e5f2c 100644 --- a/include/llvm/IntrinsicsPTX.td +++ b/include/llvm/IntrinsicsPTX.td @@ -12,27 +12,50 @@ //===----------------------------------------------------------------------===// let TargetPrefix = "ptx" in { - multiclass PTXReadSpecialRegisterIntrinsic { + // FIXME Since PTX 2.0, special registers are redefined as v4i32 type + multiclass PTXReadSpecialRegisterIntrinsic_v4i16 { def _r64 : Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>; def _v4i16 : Intrinsic<[llvm_v4i16_ty], [], [IntrNoMem]>; + def _x : Intrinsic<[llvm_i16_ty], [], [IntrNoMem]>; + def _y : Intrinsic<[llvm_i16_ty], [], [IntrNoMem]>; + def _z : Intrinsic<[llvm_i16_ty], [], [IntrNoMem]>; + def _w : Intrinsic<[llvm_i16_ty], [], [IntrNoMem]>; } - multiclass PTXReadSpecialSubRegisterIntrinsic { - def _x : Intrinsic<[llvm_i16_ty], [], [IntrNoMem]>; - def _y : Intrinsic<[llvm_i16_ty], [], [IntrNoMem]>; - def _z : Intrinsic<[llvm_i16_ty], [], [IntrNoMem]>; - def _w : Intrinsic<[llvm_i16_ty], [], [IntrNoMem]>; - } + class PTXReadSpecialRegisterIntrinsic_r32 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; + + class PTXReadSpecialRegisterIntrinsic_r64 + : Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>; } -defm int_ptx_read_tid : PTXReadSpecialRegisterIntrinsic; -defm int_ptx_read_tid : PTXReadSpecialSubRegisterIntrinsic; -defm int_ptx_read_ntid : PTXReadSpecialRegisterIntrinsic; -defm int_ptx_read_ntid : PTXReadSpecialSubRegisterIntrinsic; -defm int_ptx_read_ctaid : PTXReadSpecialRegisterIntrinsic; -defm int_ptx_read_ctaid : PTXReadSpecialSubRegisterIntrinsic; -defm int_ptx_read_nctaid : PTXReadSpecialRegisterIntrinsic; -defm int_ptx_read_nctaid : PTXReadSpecialSubRegisterIntrinsic; +defm int_ptx_read_tid : PTXReadSpecialRegisterIntrinsic_v4i16; +defm int_ptx_read_ntid : PTXReadSpecialRegisterIntrinsic_v4i16; + +def int_ptx_read_laneid : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_warpid : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_nwarpid : PTXReadSpecialRegisterIntrinsic_r32; + +defm int_ptx_read_ctaid : PTXReadSpecialRegisterIntrinsic_v4i16; +defm int_ptx_read_nctaid : PTXReadSpecialRegisterIntrinsic_v4i16; + +def int_ptx_read_smid : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_nsmid : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_gridid : PTXReadSpecialRegisterIntrinsic_r32; + +def int_ptx_read_lanemask_eq : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_lanemask_le : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_lanemask_lt : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_lanemask_ge : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_lanemask_gt : PTXReadSpecialRegisterIntrinsic_r32; + +def int_ptx_read_clock : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_clock64 : PTXReadSpecialRegisterIntrinsic_r64; + +def int_ptx_read_pm0 : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_pm1 : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_pm2 : PTXReadSpecialRegisterIntrinsic_r32; +def int_ptx_read_pm3 : PTXReadSpecialRegisterIntrinsic_r32; let TargetPrefix = "ptx" in def int_ptx_bar_sync : Intrinsic<[], [llvm_i32_ty], []>; diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td index a75c1086f7d..2f2578cb5c9 100644 --- a/lib/Target/PTX/PTXIntrinsicInstrInfo.td +++ b/lib/Target/PTX/PTXIntrinsicInstrInfo.td @@ -13,40 +13,76 @@ // PTX Special Purpose Register Accessor Intrinsics -class PTX_READ_SPECIAL_REGISTER +class PTX_READ_SPECIAL_REGISTER_R64 : InstPTX<(outs RRegu64:$d), (ins), - !strconcat("mov.u64\t$d, ", regname), + !strconcat("mov.u64\t$d, %", regname), [(set RRegu64:$d, (intop))]>; +class PTX_READ_SPECIAL_REGISTER_R32 + : InstPTX<(outs RRegu32:$d), (ins), + !strconcat("mov.u32\t$d, %", regname), + [(set RRegu32:$d, (intop))]>; + class PTX_READ_SPECIAL_SUB_REGISTER : InstPTX<(outs RRegu16:$d), (ins), - !strconcat("mov.u16\t$d, ", regname), + !strconcat("mov.u16\t$d, %", regname), [(set RRegu16:$d, (intop))]>; -def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER<"tid", int_ptx_read_tid_r64>; +// TODO Add read vector-version of special registers + +def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid", int_ptx_read_tid_r64>; def PTX_READ_TID_X : PTX_READ_SPECIAL_SUB_REGISTER<"tid.x", int_ptx_read_tid_x>; def PTX_READ_TID_Y : PTX_READ_SPECIAL_SUB_REGISTER<"tid.y", int_ptx_read_tid_y>; def PTX_READ_TID_Z : PTX_READ_SPECIAL_SUB_REGISTER<"tid.z", int_ptx_read_tid_z>; def PTX_READ_TID_W : PTX_READ_SPECIAL_SUB_REGISTER<"tid.w", int_ptx_read_tid_w>; -def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER<"ntid", int_ptx_read_ntid_r64>; +def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid", int_ptx_read_ntid_r64>; def PTX_READ_NTID_X : PTX_READ_SPECIAL_SUB_REGISTER<"ntid.x", int_ptx_read_ntid_x>; def PTX_READ_NTID_Y : PTX_READ_SPECIAL_SUB_REGISTER<"ntid.y", int_ptx_read_ntid_y>; def PTX_READ_NTID_Z : PTX_READ_SPECIAL_SUB_REGISTER<"ntid.z", int_ptx_read_ntid_z>; def PTX_READ_NTID_W : PTX_READ_SPECIAL_SUB_REGISTER<"ntid.w", int_ptx_read_ntid_w>; -def PTX_READ_CTAID_R64 : PTX_READ_SPECIAL_REGISTER<"ctaid", int_ptx_read_ctaid_r64>; +def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", int_ptx_read_laneid>; +def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", int_ptx_read_warpid>; +def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", int_ptx_read_nwarpid>; + +def PTX_READ_CTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>; def PTX_READ_CTAID_X : PTX_READ_SPECIAL_SUB_REGISTER<"ctaid.x", int_ptx_read_ctaid_x>; def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_SUB_REGISTER<"ctaid.y", int_ptx_read_ctaid_y>; def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_SUB_REGISTER<"ctaid.z", int_ptx_read_ctaid_z>; def PTX_READ_CTAID_W : PTX_READ_SPECIAL_SUB_REGISTER<"ctaid.w", int_ptx_read_ctaid_w>; -def PTX_READ_NCTAID_R64 : PTX_READ_SPECIAL_REGISTER<"nctaid", int_ptx_read_nctaid_r64>; +def PTX_READ_NCTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>; def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_SUB_REGISTER<"nctaid.x", int_ptx_read_nctaid_x>; def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_SUB_REGISTER<"nctaid.y", int_ptx_read_nctaid_y>; def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_SUB_REGISTER<"nctaid.z", int_ptx_read_nctaid_z>; def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_SUB_REGISTER<"nctaid.w", int_ptx_read_nctaid_w>; +def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", int_ptx_read_smid>; +def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid", int_ptx_read_nsmid>; +def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", int_ptx_read_gridid>; + +def PTX_READ_LANEMASK_EQ + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>; +def PTX_READ_LANEMASK_LE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>; +def PTX_READ_LANEMASK_LT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>; +def PTX_READ_LANEMASK_GE + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>; +def PTX_READ_LANEMASK_GT + : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>; + +def PTX_READ_CLOCK + : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>; +def PTX_READ_CLOCK64 + : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>; + +def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>; +def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>; +def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>; +def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>; + // PTX Parallel Synchronization and Communication Intrinsics def PTX_BAR_SYNC : InstPTX<(outs), (ins i32imm:$i), "bar.sync\t$i", diff --git a/test/CodeGen/PTX/intrinsic.ll b/test/CodeGen/PTX/intrinsic.ll index 804d8b5f95d..139e29ee05a 100644 --- a/test/CodeGen/PTX/intrinsic.ll +++ b/test/CodeGen/PTX/intrinsic.ll @@ -1,118 +1,237 @@ -; RUN: llc < %s -march=ptx | FileCheck %s +; RUN: llc < %s -march=ptx -mattr=+ptx20,+sm20 | FileCheck %s -define ptx_device i16 @tid_x() { -; CHECK: mov.u16 rh0, tid.x; +define ptx_device i16 @test_tid_x() { +; CHECK: mov.u16 rh0, %tid.x; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.tid.x() ret i16 %x } -define ptx_device i16 @tid_y() { -; CHECK: mov.u16 rh0, tid.y; +define ptx_device i16 @test_tid_y() { +; CHECK: mov.u16 rh0, %tid.y; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.tid.y() ret i16 %x } -define ptx_device i16 @tid_z() { -; CHECK: mov.u16 rh0, tid.z; +define ptx_device i16 @test_tid_z() { +; CHECK: mov.u16 rh0, %tid.z; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.tid.z() ret i16 %x } -define ptx_device i16 @tid_w() { -; CHECK: mov.u16 rh0, tid.w; +define ptx_device i16 @test_tid_w() { +; CHECK: mov.u16 rh0, %tid.w; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.tid.w() ret i16 %x } -define ptx_device i16 @ntid_x() { -; CHECK: mov.u16 rh0, ntid.x; +define ptx_device i16 @test_ntid_x() { +; CHECK: mov.u16 rh0, %ntid.x; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.ntid.x() ret i16 %x } -define ptx_device i16 @ntid_y() { -; CHECK: mov.u16 rh0, ntid.y; +define ptx_device i16 @test_ntid_y() { +; CHECK: mov.u16 rh0, %ntid.y; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.ntid.y() ret i16 %x } -define ptx_device i16 @ntid_z() { -; CHECK: mov.u16 rh0, ntid.z; +define ptx_device i16 @test_ntid_z() { +; CHECK: mov.u16 rh0, %ntid.z; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.ntid.z() ret i16 %x } -define ptx_device i16 @ntid_w() { -; CHECK: mov.u16 rh0, ntid.w; +define ptx_device i16 @test_ntid_w() { +; CHECK: mov.u16 rh0, %ntid.w; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.ntid.w() ret i16 %x } -define ptx_device i16 @ctaid_x() { -; CHECK: mov.u16 rh0, ctaid.x; +define ptx_device i32 @test_laneid() { +; CHECK: mov.u32 r0, %laneid; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.laneid() + ret i32 %x +} + +define ptx_device i32 @test_warpid() { +; CHECK: mov.u32 r0, %warpid; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.warpid() + ret i32 %x +} + +define ptx_device i32 @test_nwarpid() { +; CHECK: mov.u32 r0, %nwarpid; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.nwarpid() + ret i32 %x +} + +define ptx_device i16 @test_ctaid_x() { +; CHECK: mov.u16 rh0, %ctaid.x; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.ctaid.x() ret i16 %x } -define ptx_device i16 @ctaid_y() { -; CHECK: mov.u16 rh0, ctaid.y; +define ptx_device i16 @test_ctaid_y() { +; CHECK: mov.u16 rh0, %ctaid.y; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.ctaid.y() ret i16 %x } -define ptx_device i16 @ctaid_z() { -; CHECK: mov.u16 rh0, ctaid.z; +define ptx_device i16 @test_ctaid_z() { +; CHECK: mov.u16 rh0, %ctaid.z; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.ctaid.z() ret i16 %x } -define ptx_device i16 @ctaid_w() { -; CHECK: mov.u16 rh0, ctaid.w; +define ptx_device i16 @test_ctaid_w() { +; CHECK: mov.u16 rh0, %ctaid.w; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.ctaid.w() ret i16 %x } -define ptx_device i16 @nctaid_x() { -; CHECK: mov.u16 rh0, nctaid.x; +define ptx_device i16 @test_nctaid_x() { +; CHECK: mov.u16 rh0, %nctaid.x; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.nctaid.x() ret i16 %x } -define ptx_device i16 @nctaid_y() { -; CHECK: mov.u16 rh0, nctaid.y; +define ptx_device i16 @test_nctaid_y() { +; CHECK: mov.u16 rh0, %nctaid.y; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.nctaid.y() ret i16 %x } -define ptx_device i16 @nctaid_z() { -; CHECK: mov.u16 rh0, nctaid.z; +define ptx_device i16 @test_nctaid_z() { +; CHECK: mov.u16 rh0, %nctaid.z; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.nctaid.z() ret i16 %x } -define ptx_device i16 @nctaid_w() { -; CHECK: mov.u16 rh0, nctaid.w; +define ptx_device i16 @test_nctaid_w() { +; CHECK: mov.u16 rh0, %nctaid.w; ; CHECK-NEXT: ret; %x = call i16 @llvm.ptx.read.nctaid.w() ret i16 %x } -define ptx_device void @bar_sync() { +define ptx_device i32 @test_smid() { +; CHECK: mov.u32 r0, %smid; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.smid() + ret i32 %x +} + +define ptx_device i32 @test_nsmid() { +; CHECK: mov.u32 r0, %nsmid; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.nsmid() + ret i32 %x +} + +define ptx_device i32 @test_gridid() { +; CHECK: mov.u32 r0, %gridid; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.gridid() + ret i32 %x +} + +define ptx_device i32 @test_lanemask_eq() { +; CHECK: mov.u32 r0, %lanemask_eq; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.lanemask.eq() + ret i32 %x +} + +define ptx_device i32 @test_lanemask_le() { +; CHECK: mov.u32 r0, %lanemask_le; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.lanemask.le() + ret i32 %x +} + +define ptx_device i32 @test_lanemask_lt() { +; CHECK: mov.u32 r0, %lanemask_lt; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.lanemask.lt() + ret i32 %x +} + +define ptx_device i32 @test_lanemask_ge() { +; CHECK: mov.u32 r0, %lanemask_ge; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.lanemask.ge() + ret i32 %x +} + +define ptx_device i32 @test_lanemask_gt() { +; CHECK: mov.u32 r0, %lanemask_gt; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.lanemask.gt() + ret i32 %x +} + +define ptx_device i32 @test_clock() { +; CHECK: mov.u32 r0, %clock; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.clock() + ret i32 %x +} + +define ptx_device i64 @test_clock64() { +; CHECK: mov.u64 rd0, %clock64; +; CHECK-NEXT: ret; + %x = call i64 @llvm.ptx.read.clock64() + ret i64 %x +} + +define ptx_device i32 @test_pm0() { +; CHECK: mov.u32 r0, %pm0; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.pm0() + ret i32 %x +} + +define ptx_device i32 @test_pm1() { +; CHECK: mov.u32 r0, %pm1; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.pm1() + ret i32 %x +} + +define ptx_device i32 @test_pm2() { +; CHECK: mov.u32 r0, %pm2; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.pm2() + ret i32 %x +} + +define ptx_device i32 @test_pm3() { +; CHECK: mov.u32 r0, %pm3; +; CHECK-NEXT: ret; + %x = call i32 @llvm.ptx.read.pm3() + ret i32 %x +} + +define ptx_device void @test_bar_sync() { ; CHECK: bar.sync 0 ; CHECK-NEXT: ret; call void @llvm.ptx.bar.sync(i32 0) @@ -127,6 +246,11 @@ declare i16 @llvm.ptx.read.ntid.x() declare i16 @llvm.ptx.read.ntid.y() declare i16 @llvm.ptx.read.ntid.z() declare i16 @llvm.ptx.read.ntid.w() + +declare i32 @llvm.ptx.read.laneid() +declare i32 @llvm.ptx.read.warpid() +declare i32 @llvm.ptx.read.nwarpid() + declare i16 @llvm.ptx.read.ctaid.x() declare i16 @llvm.ptx.read.ctaid.y() declare i16 @llvm.ptx.read.ctaid.z() @@ -136,4 +260,22 @@ declare i16 @llvm.ptx.read.nctaid.y() declare i16 @llvm.ptx.read.nctaid.z() declare i16 @llvm.ptx.read.nctaid.w() +declare i32 @llvm.ptx.read.smid() +declare i32 @llvm.ptx.read.nsmid() +declare i32 @llvm.ptx.read.gridid() + +declare i32 @llvm.ptx.read.lanemask.eq() +declare i32 @llvm.ptx.read.lanemask.le() +declare i32 @llvm.ptx.read.lanemask.lt() +declare i32 @llvm.ptx.read.lanemask.ge() +declare i32 @llvm.ptx.read.lanemask.gt() + +declare i32 @llvm.ptx.read.clock() +declare i64 @llvm.ptx.read.clock64() + +declare i32 @llvm.ptx.read.pm0() +declare i32 @llvm.ptx.read.pm1() +declare i32 @llvm.ptx.read.pm2() +declare i32 @llvm.ptx.read.pm3() + declare void @llvm.ptx.bar.sync(i32 %i)