mirror of
				https://github.com/c64scene-ar/llvm-6502.git
				synced 2025-10-26 02:22:29 +00:00 
			
		
		
		
	[NVPTX] Update the usage document
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194812 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
		| @@ -66,6 +66,8 @@ function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not. | |||||||
| When compiled, the PTX kernel functions are callable by host-side code. | When compiled, the PTX kernel functions are callable by host-side code. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | .. _address_spaces: | ||||||
|  |  | ||||||
| Address Spaces | Address Spaces | ||||||
| -------------- | -------------- | ||||||
|  |  | ||||||
| @@ -103,6 +105,25 @@ space in LLVM, so the ``addrspace(N)`` annotation is *required* for global | |||||||
| variables. | variables. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Triples | ||||||
|  | ------- | ||||||
|  |  | ||||||
|  | The NVPTX target uses the module triple to select between 32/64-bit code | ||||||
|  | generation and the driver-compiler interface to use. The triple architecture | ||||||
|  | can be one of ``nvptx`` (32-bit PTX) or ``nvptx64`` (64-bit PTX). The | ||||||
|  | operating system should be one of ``cuda`` or ``nvcl``, which determines the | ||||||
|  | interface used by the generated code to communicate with the driver.  Most | ||||||
|  | users will want to use ``cuda`` as the operating system, which makes the | ||||||
|  | generated PTX compatible with the CUDA Driver API. | ||||||
|  |  | ||||||
|  | Example: 32-bit PTX for CUDA Driver API: ``nvptx-nvidia-cuda`` | ||||||
|  |  | ||||||
|  | Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda`` | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | .. _nvptx_intrinsics: | ||||||
|  |  | ||||||
| NVPTX Intrinsics | NVPTX Intrinsics | ||||||
| ================ | ================ | ||||||
|  |  | ||||||
| @@ -238,6 +259,116 @@ For the full set of NVPTX intrinsics, please see the | |||||||
| ``include/llvm/IR/IntrinsicsNVVM.td`` file in the LLVM source tree. | ``include/llvm/IR/IntrinsicsNVVM.td`` file in the LLVM source tree. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | .. _libdevice: | ||||||
|  |  | ||||||
|  | Linking with Libdevice | ||||||
|  | ====================== | ||||||
|  |  | ||||||
|  | The CUDA Toolkit comes with an LLVM bitcode library called ``libdevice`` that | ||||||
|  | implements many common mathematical functions. This library can be used as a | ||||||
|  | high-performance math library for any compilers using the LLVM NVPTX target. | ||||||
|  | The library can be found under ``nvvm/libdevice/`` in the CUDA Toolkit and | ||||||
|  | there is a separate version for each compute architecture. | ||||||
|  |  | ||||||
|  | For a list of all math functions implemented in libdevice, see | ||||||
|  | `libdevice Users Guide <http://docs.nvidia.com/cuda/libdevice-users-guide/index.html>`_. | ||||||
|  |  | ||||||
|  | To accomodate various math-related compiler flags that can affect code | ||||||
|  | generation of libdevice code, the library code depends on a special LLVM IR | ||||||
|  | pass (``NVVMReflect``) to handle conditional compilation within LLVM IR. This | ||||||
|  | pass looks for calls to the ``@__nvvm_reflect`` function and replaces them | ||||||
|  | with constants based on the defined reflection parameters. Such conditional | ||||||
|  | code often follows a pattern: | ||||||
|  |  | ||||||
|  | .. code-block:: c++ | ||||||
|  |  | ||||||
|  |   float my_function(float a) { | ||||||
|  |     if (__nvvm_reflect("FASTMATH")) | ||||||
|  |       return my_function_fast(a); | ||||||
|  |     else | ||||||
|  |       return my_function_precise(a); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | The default value for all unspecified reflection parameters is zero.  | ||||||
|  |  | ||||||
|  | The ``NVVMReflect`` pass should be executed early in the optimization | ||||||
|  | pipeline, immediately after the link stage. The ``internalize`` pass is also | ||||||
|  | recommended to remove unused math functions from the resulting PTX. For an | ||||||
|  | input IR module ``module.bc``, the following compilation flow is recommended: | ||||||
|  |  | ||||||
|  | 1. Save list of external functions in ``module.bc`` | ||||||
|  | 2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc`` | ||||||
|  | 3. Internalize all functions not in list from (1) | ||||||
|  | 4. Eliminate all unused internal functions | ||||||
|  | 5. Run ``NVVMReflect`` pass | ||||||
|  | 6. Run standard optimization pipeline | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |  | ||||||
|  |   ``linkonce`` and ``linkonce_odr`` linkage types are not suitable for the | ||||||
|  |   libdevice functions. It is possible to link two IR modules that have been | ||||||
|  |   linked against libdevice using different reflection variables. | ||||||
|  |  | ||||||
|  | Since the ``NVVMReflect`` pass replaces conditionals with constants, it will | ||||||
|  | often leave behind dead code of the form: | ||||||
|  |  | ||||||
|  | .. code-block:: llvm | ||||||
|  |  | ||||||
|  |   entry: | ||||||
|  |     .. | ||||||
|  |     br i1 true, label %foo, label %bar | ||||||
|  |   foo: | ||||||
|  |     .. | ||||||
|  |   bar: | ||||||
|  |     ; Dead code | ||||||
|  |     .. | ||||||
|  |  | ||||||
|  | Therefore, it is recommended that ``NVVMReflect`` is executed early in the | ||||||
|  | optimization pipeline before dead-code elimination. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Reflection Parameters | ||||||
|  | --------------------- | ||||||
|  |  | ||||||
|  | The libdevice library currently uses the following reflection parameters to | ||||||
|  | control code generation: | ||||||
|  |  | ||||||
|  | ==================== ====================================================== | ||||||
|  | Flag                 Description | ||||||
|  | ==================== ====================================================== | ||||||
|  | ``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero | ||||||
|  | ==================== ====================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Invoking NVVMReflect | ||||||
|  | -------------------- | ||||||
|  |  | ||||||
|  | To ensure that all dead code caused by the reflection pass is eliminated, it | ||||||
|  | is recommended that the reflection pass is executed early in the LLVM IR | ||||||
|  | optimization pipeline. The pass takes an optional mapping of reflection | ||||||
|  | parameter name to an integer value. This mapping can be specified as either a | ||||||
|  | command-line option to ``opt`` or as an LLVM ``StringMap<int>`` object when | ||||||
|  | programmatically creating a pass pipeline. | ||||||
|  |  | ||||||
|  | With ``opt``: | ||||||
|  |  | ||||||
|  | .. code-block:: text | ||||||
|  |  | ||||||
|  |   # opt -nvvm-reflect -nvvm-reflect-list=<var>=<value>,<var>=<value> module.bc -o module.reflect.bc | ||||||
|  |  | ||||||
|  |  | ||||||
|  | With programmatic pass pipeline: | ||||||
|  |  | ||||||
|  | .. code-block:: c++ | ||||||
|  |  | ||||||
|  |   extern ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping); | ||||||
|  |  | ||||||
|  |   StringMap<int> ReflectParams; | ||||||
|  |   ReflectParams["__CUDA_FTZ"] = 1; | ||||||
|  |   Passes.add(createNVVMReflectPass(ReflectParams)); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| Executing PTX | Executing PTX | ||||||
| ============= | ============= | ||||||
|  |  | ||||||
| @@ -274,3 +405,576 @@ JIT compiling a PTX string to a device binary: | |||||||
|  |  | ||||||
| For full examples of executing PTX assembly, please see the `CUDA Samples | For full examples of executing PTX assembly, please see the `CUDA Samples | ||||||
| <https://developer.nvidia.com/cuda-downloads>`_ distribution. | <https://developer.nvidia.com/cuda-downloads>`_ distribution. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Common Issues | ||||||
|  | ============= | ||||||
|  |  | ||||||
|  | ptxas complains of undefined function: __nvvm_reflect | ||||||
|  | ----------------------------------------------------- | ||||||
|  |  | ||||||
|  | When linking with libdevice, the ``NVVMReflect`` pass must be used. See | ||||||
|  | :ref:`libdevice` for more information. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Tutorial: A Simple Compute Kernel | ||||||
|  | ================================= | ||||||
|  |  | ||||||
|  | To start, let us take a look at a simple compute kernel written directly in | ||||||
|  | LLVM IR. The kernel implements vector addition, where each thread computes one | ||||||
|  | element of the output vector C from the input vectors A and B.  To make this | ||||||
|  | easier, we also assume that only a single CTA (thread block) will be launched, | ||||||
|  | and that it will be one dimensional. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | The Kernel | ||||||
|  | ---------- | ||||||
|  |  | ||||||
|  | .. code-block:: llvm | ||||||
|  |  | ||||||
|  |   target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" | ||||||
|  |   target triple = "nvptx64-nvidia-cuda" | ||||||
|  |  | ||||||
|  |   ; Intrinsic to read X component of thread ID | ||||||
|  |   declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind | ||||||
|  |  | ||||||
|  |   define void @kernel(float addrspace(1)* %A, | ||||||
|  |                       float addrspace(1)* %B, | ||||||
|  |                       float addrspace(1)* %C) { | ||||||
|  |   entry: | ||||||
|  |     ; What is my ID? | ||||||
|  |     %id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind | ||||||
|  |  | ||||||
|  |     ; Compute pointers into A, B, and C | ||||||
|  |     %ptrA = getelementptr float addrspace(1)* %A, i32 %id | ||||||
|  |     %ptrB = getelementptr float addrspace(1)* %B, i32 %id | ||||||
|  |     %ptrC = getelementptr float addrspace(1)* %C, i32 %id | ||||||
|  |  | ||||||
|  |     ; Read A, B | ||||||
|  |     %valA = load float addrspace(1)* %ptrA, align 4 | ||||||
|  |     %valB = load float addrspace(1)* %ptrB, align 4 | ||||||
|  |  | ||||||
|  |     ; Compute C = A + B | ||||||
|  |     %valC = fadd float %valA, %valB | ||||||
|  |  | ||||||
|  |     ; Store back to C | ||||||
|  |     store float %valC, float addrspace(1)* %ptrC, align 4 | ||||||
|  |  | ||||||
|  |     ret void | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   !nvvm.annotations = !{!0} | ||||||
|  |   !0 = metadata !{void (float addrspace(1)*, | ||||||
|  |                         float addrspace(1)*, | ||||||
|  |                         float addrspace(1)*)* @kernel, metadata !"kernel", i32 1} | ||||||
|  |  | ||||||
|  |  | ||||||
|  | We can use the LLVM ``llc`` tool to directly run the NVPTX code generator: | ||||||
|  |  | ||||||
|  | .. code-block:: text | ||||||
|  |  | ||||||
|  |   # llc -mcpu=sm_20 kernel.ll -o kernel.ptx | ||||||
|  |  | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |  | ||||||
|  |   If you want to generate 32-bit code, change ``p:64:64:64`` to ``p:32:32:32`` | ||||||
|  |   in the module data layout string and use ``nvptx64-nvidia-cuda`` as the | ||||||
|  |   target triple. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | The output we get from ``llc`` (as of LLVM 3.4): | ||||||
|  |  | ||||||
|  | .. code-block:: text | ||||||
|  |  | ||||||
|  |   // | ||||||
|  |   // Generated by LLVM NVPTX Back-End | ||||||
|  |   // | ||||||
|  |  | ||||||
|  |   .version 3.1 | ||||||
|  |   .target sm_20 | ||||||
|  |   .address_size 64 | ||||||
|  |  | ||||||
|  |     // .globl kernel | ||||||
|  |                                           // @kernel | ||||||
|  |   .visible .entry kernel( | ||||||
|  |     .param .u64 kernel_param_0, | ||||||
|  |     .param .u64 kernel_param_1, | ||||||
|  |     .param .u64 kernel_param_2 | ||||||
|  |   ) | ||||||
|  |   { | ||||||
|  |     .reg .f32   %f<4>; | ||||||
|  |     .reg .s32   %r<2>; | ||||||
|  |     .reg .s64   %rl<8>; | ||||||
|  |  | ||||||
|  |   // BB#0:                                // %entry | ||||||
|  |     ld.param.u64    %rl1, [kernel_param_0]; | ||||||
|  |     mov.u32         %r1, %tid.x; | ||||||
|  |     mul.wide.s32    %rl2, %r1, 4; | ||||||
|  |     add.s64         %rl3, %rl1, %rl2; | ||||||
|  |     ld.param.u64    %rl4, [kernel_param_1]; | ||||||
|  |     add.s64         %rl5, %rl4, %rl2; | ||||||
|  |     ld.param.u64    %rl6, [kernel_param_2]; | ||||||
|  |     add.s64         %rl7, %rl6, %rl2; | ||||||
|  |     ld.global.f32   %f1, [%rl3]; | ||||||
|  |     ld.global.f32   %f2, [%rl5]; | ||||||
|  |     add.f32         %f3, %f1, %f2; | ||||||
|  |     st.global.f32   [%rl7], %f3; | ||||||
|  |     ret; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Dissecting the Kernel | ||||||
|  | --------------------- | ||||||
|  |  | ||||||
|  | Now let us dissect the LLVM IR that makes up this kernel.  | ||||||
|  |  | ||||||
|  | Data Layout | ||||||
|  | ^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | The data layout string determines the size in bits of common data types, their | ||||||
|  | ABI alignment, and their storage size.  For NVPTX, you should use one of the | ||||||
|  | following: | ||||||
|  |  | ||||||
|  | 32-bit PTX: | ||||||
|  |  | ||||||
|  | .. code-block:: llvm | ||||||
|  |  | ||||||
|  |   target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" | ||||||
|  |  | ||||||
|  | 64-bit PTX: | ||||||
|  |  | ||||||
|  | .. code-block:: llvm | ||||||
|  |  | ||||||
|  |   target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Target Intrinsics | ||||||
|  | ^^^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | In this example, we use the ``@llvm.nvvm.read.ptx.sreg.tid.x`` intrinsic to | ||||||
|  | read the X component of the current thread's ID, which corresponds to a read | ||||||
|  | of register ``%tid.x`` in PTX. The NVPTX back-end supports a large set of | ||||||
|  | intrinsics.  A short list is shown below; please see | ||||||
|  | ``include/llvm/IR/IntrinsicsNVVM.td`` for the full list. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ================================================ ==================== | ||||||
|  | Intrinsic                                        CUDA Equivalent | ||||||
|  | ================================================ ==================== | ||||||
|  | ``i32 @llvm.nvvm.read.ptx.sreg.tid.{x,y,z}``     threadIdx.{x,y,z} | ||||||
|  | ``i32 @llvm.nvvm.read.ptx.sreg.ctaid.{x,y,z}``   blockIdx.{x,y,z} | ||||||
|  | ``i32 @llvm.nvvm.read.ptx.sreg.ntid.{x,y,z}``    blockDim.{x,y,z} | ||||||
|  | ``i32 @llvm.nvvm.read.ptx.sreg.nctaid.{x,y,z}``  gridDim.{x,y,z} | ||||||
|  | ``void @llvm.cuda.syncthreads()``                __syncthreads() | ||||||
|  | ================================================ ==================== | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Address Spaces | ||||||
|  | ^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | You may have noticed that all of the pointer types in the LLVM IR example had | ||||||
|  | an explicit address space specifier. What is address space 1? NVIDIA GPU | ||||||
|  | devices (generally) have four types of memory: | ||||||
|  |  | ||||||
|  | - Global: Large, off-chip memory | ||||||
|  | - Shared: Small, on-chip memory shared among all threads in a CTA | ||||||
|  | - Local: Per-thread, private memory | ||||||
|  | - Constant: Read-only memory shared across all threads | ||||||
|  |  | ||||||
|  | These different types of memory are represented in LLVM IR as address spaces. | ||||||
|  | There is also a fifth address space used by the NVPTX code generator that | ||||||
|  | corresponds to the "generic" address space.  This address space can represent | ||||||
|  | addresses in any other address space (with a few exceptions).  This allows | ||||||
|  | users to write IR functions that can load/store memory using the same | ||||||
|  | instructions. Intrinsics are provided to convert pointers between the generic | ||||||
|  | and non-generic address spaces. | ||||||
|  |  | ||||||
|  | See :ref:`address_spaces` and :ref:`nvptx_intrinsics` for more information. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Kernel Metadata | ||||||
|  | ^^^^^^^^^^^^^^^ | ||||||
|  |  | ||||||
|  | In PTX, a function can be either a `kernel` function (callable from the host | ||||||
|  | program), or a `device` function (callable only from GPU code). You can think | ||||||
|  | of `kernel` functions as entry-points in the GPU program. To mark an LLVM IR | ||||||
|  | function as a `kernel` function, we make use of special LLVM metadata. The | ||||||
|  | NVPTX back-end will look for a named metadata node called | ||||||
|  | ``nvvm.annotations``. This named metadata must contain a list of metadata that | ||||||
|  | describe the IR. For our purposes, we need to declare a metadata node that | ||||||
|  | assigns the "kernel" attribute to the LLVM IR function that should be emitted | ||||||
|  | as a PTX `kernel` function. These metadata nodes take the form: | ||||||
|  |  | ||||||
|  | .. code-block:: text | ||||||
|  |  | ||||||
|  |   metadata !{<function ref>, metadata !"kernel", i32 1} | ||||||
|  |  | ||||||
|  | For the previous example, we have: | ||||||
|  |  | ||||||
|  | .. code-block:: llvm | ||||||
|  |  | ||||||
|  |   !nvvm.annotations = !{!0} | ||||||
|  |   !0 = metadata !{void (float addrspace(1)*, | ||||||
|  |                         float addrspace(1)*, | ||||||
|  |                         float addrspace(1)*)* @kernel, metadata !"kernel", i32 1} | ||||||
|  |  | ||||||
|  | Here, we have a single metadata declaration in ``nvvm.annotations``. This | ||||||
|  | metadata annotates our ``@kernel`` function with the ``kernel`` attribute. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Running the Kernel | ||||||
|  | ------------------ | ||||||
|  |  | ||||||
|  | Generating PTX from LLVM IR is all well and good, but how do we execute it on | ||||||
|  | a real GPU device? The CUDA Driver API provides a convenient mechanism for | ||||||
|  | loading and JIT compiling PTX to a native GPU device, and launching a kernel. | ||||||
|  | The API is similar to OpenCL.  A simple example showing how to load and | ||||||
|  | execute our vector addition code is shown below. Note that for brevity this | ||||||
|  | code does not perform much error checking! | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |  | ||||||
|  |   You can also use the ``ptxas`` tool provided by the CUDA Toolkit to offline | ||||||
|  |   compile PTX to machine code (SASS) for a specific GPU architecture. Such | ||||||
|  |   binaries can be loaded by the CUDA Driver API in the same way as PTX. This | ||||||
|  |   can be useful for reducing startup time by precompiling the PTX kernels. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | .. code-block:: c++ | ||||||
|  |  | ||||||
|  |   #include <iostream> | ||||||
|  |   #include <fstream> | ||||||
|  |   #include <cassert> | ||||||
|  |   #include "cuda.h" | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   void checkCudaErrors(CUresult err) { | ||||||
|  |     assert(err == CUDA_SUCCESS); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   /// main - Program entry point | ||||||
|  |   int main(int argc, char **argv) { | ||||||
|  |     CUdevice    device; | ||||||
|  |     CUmodule    cudaModule; | ||||||
|  |     CUcontext   context; | ||||||
|  |     CUfunction  function; | ||||||
|  |     CUlinkState linker; | ||||||
|  |     int         devCount; | ||||||
|  |  | ||||||
|  |     // CUDA initialization | ||||||
|  |     checkCudaErrors(cuInit(0)); | ||||||
|  |     checkCudaErrors(cuDeviceGetCount(&devCount)); | ||||||
|  |     checkCudaErrors(cuDeviceGet(&device, 0)); | ||||||
|  |  | ||||||
|  |     char name[128]; | ||||||
|  |     checkCudaErrors(cuDeviceGetName(name, 128, device)); | ||||||
|  |     std::cout << "Using CUDA Device [0]: " << name << "\n"; | ||||||
|  |  | ||||||
|  |     int devMajor, devMinor; | ||||||
|  |     checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); | ||||||
|  |     std::cout << "Device Compute Capability: " | ||||||
|  |               << devMajor << "." << devMinor << "\n"; | ||||||
|  |     if (devMajor < 2) { | ||||||
|  |       std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; | ||||||
|  |       return 1; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     std::ifstream t("kernel.ptx"); | ||||||
|  |     if (!t.is_open()) { | ||||||
|  |       std::cerr << "kernel.ptx not found\n"; | ||||||
|  |       return 1; | ||||||
|  |     } | ||||||
|  |     std::string str((std::istreambuf_iterator<char>(t)), | ||||||
|  |                       std::istreambuf_iterator<char>()); | ||||||
|  |  | ||||||
|  |     // Create driver context | ||||||
|  |     checkCudaErrors(cuCtxCreate(&context, 0, device)); | ||||||
|  |  | ||||||
|  |     // Create module for object | ||||||
|  |     checkCudaErrors(cuModuleLoadDataEx(&cudaModule, str.c_str(), 0, 0, 0)); | ||||||
|  |  | ||||||
|  |     // Get kernel function | ||||||
|  |     checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); | ||||||
|  |  | ||||||
|  |     // Device data | ||||||
|  |     CUdeviceptr devBufferA; | ||||||
|  |     CUdeviceptr devBufferB; | ||||||
|  |     CUdeviceptr devBufferC; | ||||||
|  |  | ||||||
|  |     checkCudaErrors(cuMemAlloc(&devBufferA, sizeof(float)*16)); | ||||||
|  |     checkCudaErrors(cuMemAlloc(&devBufferB, sizeof(float)*16)); | ||||||
|  |     checkCudaErrors(cuMemAlloc(&devBufferC, sizeof(float)*16)); | ||||||
|  |  | ||||||
|  |     float* hostA = new float[16]; | ||||||
|  |     float* hostB = new float[16]; | ||||||
|  |     float* hostC = new float[16]; | ||||||
|  |  | ||||||
|  |     // Populate input | ||||||
|  |     for (unsigned i = 0; i != 16; ++i) { | ||||||
|  |       hostA[i] = (float)i; | ||||||
|  |       hostB[i] = (float)(2*i); | ||||||
|  |       hostC[i] = 0.0f; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     checkCudaErrors(cuMemcpyHtoD(devBufferA, &hostA[0], sizeof(float)*16)); | ||||||
|  |     checkCudaErrors(cuMemcpyHtoD(devBufferB, &hostB[0], sizeof(float)*16)); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     unsigned blockSizeX = 16; | ||||||
|  |     unsigned blockSizeY = 1; | ||||||
|  |     unsigned blockSizeZ = 1; | ||||||
|  |     unsigned gridSizeX  = 1; | ||||||
|  |     unsigned gridSizeY  = 1; | ||||||
|  |     unsigned gridSizeZ  = 1; | ||||||
|  |  | ||||||
|  |     // Kernel parameters | ||||||
|  |     void *KernelParams[] = { &devBufferA, &devBufferB, &devBufferC }; | ||||||
|  |  | ||||||
|  |     std::cout << "Launching kernel\n"; | ||||||
|  |  | ||||||
|  |     // Kernel launch | ||||||
|  |     checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ, | ||||||
|  |                                    blockSizeX, blockSizeY, blockSizeZ, | ||||||
|  |                                    0, NULL, KernelParams, NULL)); | ||||||
|  |  | ||||||
|  |     // Retrieve device data | ||||||
|  |     checkCudaErrors(cuMemcpyDtoH(&hostC[0], devBufferC, sizeof(float)*16)); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     std::cout << "Results:\n"; | ||||||
|  |     for (unsigned i = 0; i != 16; ++i) { | ||||||
|  |       std::cout << hostA[i] << " + " << hostB[i] << " = " << hostC[i] << "\n"; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     // Clean up after ourselves | ||||||
|  |     delete [] hostA; | ||||||
|  |     delete [] hostB; | ||||||
|  |     delete [] hostC; | ||||||
|  |  | ||||||
|  |     // Clean-up | ||||||
|  |     checkCudaErrors(cuMemFree(devBufferA)); | ||||||
|  |     checkCudaErrors(cuMemFree(devBufferB)); | ||||||
|  |     checkCudaErrors(cuMemFree(devBufferC)); | ||||||
|  |     checkCudaErrors(cuModuleUnload(cudaModule)); | ||||||
|  |     checkCudaErrors(cuCtxDestroy(context)); | ||||||
|  |  | ||||||
|  |     return 0; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | You will need to link with the CUDA driver and specify the path to cuda.h. | ||||||
|  |  | ||||||
|  | .. code-block:: text | ||||||
|  |  | ||||||
|  |   # clang++ sample.cpp -o sample -O2 -g -I/usr/local/cuda-5.5/include -lcuda | ||||||
|  |  | ||||||
|  | We don't need to specify a path to ``libcuda.so`` since this is installed in a | ||||||
|  | system location by the driver, not the CUDA toolkit. | ||||||
|  |  | ||||||
|  | If everything goes as planned, you should see the following output when | ||||||
|  | running the compiled program: | ||||||
|  |  | ||||||
|  | .. code-block:: text | ||||||
|  |  | ||||||
|  |   Using CUDA Device [0]: GeForce GTX 680 | ||||||
|  |   Device Compute Capability: 3.0 | ||||||
|  |   Launching kernel | ||||||
|  |   Results: | ||||||
|  |   0 + 0 = 0 | ||||||
|  |   1 + 2 = 3 | ||||||
|  |   2 + 4 = 6 | ||||||
|  |   3 + 6 = 9 | ||||||
|  |   4 + 8 = 12 | ||||||
|  |   5 + 10 = 15 | ||||||
|  |   6 + 12 = 18 | ||||||
|  |   7 + 14 = 21 | ||||||
|  |   8 + 16 = 24 | ||||||
|  |   9 + 18 = 27 | ||||||
|  |   10 + 20 = 30 | ||||||
|  |   11 + 22 = 33 | ||||||
|  |   12 + 24 = 36 | ||||||
|  |   13 + 26 = 39 | ||||||
|  |   14 + 28 = 42 | ||||||
|  |   15 + 30 = 45 | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |  | ||||||
|  |   You will likely see a different device identifier based on your hardware | ||||||
|  |  | ||||||
|  |  | ||||||
|  | Tutorial: Linking with Libdevice | ||||||
|  | ================================ | ||||||
|  |  | ||||||
|  | In this tutorial, we show a simple example of linking LLVM IR with the | ||||||
|  | libdevice library. We will use the same kernel as the previous tutorial, | ||||||
|  | except that we will compute ``C = pow(A, B)`` instead of ``C = A + B``. | ||||||
|  | Libdevice provides an ``__nv_powf`` function that we will use. | ||||||
|  |  | ||||||
|  | .. code-block:: llvm | ||||||
|  |  | ||||||
|  |   target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" | ||||||
|  |   target triple = "nvptx64-nvidia-cuda" | ||||||
|  |  | ||||||
|  |   ; Intrinsic to read X component of thread ID | ||||||
|  |   declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind | ||||||
|  |   ; libdevice function | ||||||
|  |   declare float @__nv_powf(float, float) | ||||||
|  |  | ||||||
|  |   define void @kernel(float addrspace(1)* %A, | ||||||
|  |                       float addrspace(1)* %B, | ||||||
|  |                       float addrspace(1)* %C) { | ||||||
|  |   entry: | ||||||
|  |     ; What is my ID? | ||||||
|  |     %id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind | ||||||
|  |  | ||||||
|  |     ; Compute pointers into A, B, and C | ||||||
|  |     %ptrA = getelementptr float addrspace(1)* %A, i32 %id | ||||||
|  |     %ptrB = getelementptr float addrspace(1)* %B, i32 %id | ||||||
|  |     %ptrC = getelementptr float addrspace(1)* %C, i32 %id | ||||||
|  |  | ||||||
|  |     ; Read A, B | ||||||
|  |     %valA = load float addrspace(1)* %ptrA, align 4 | ||||||
|  |     %valB = load float addrspace(1)* %ptrB, align 4 | ||||||
|  |  | ||||||
|  |     ; Compute C = pow(A, B) | ||||||
|  |     %valC = call float @__nv_exp2f(float %valA, float %valB) | ||||||
|  |  | ||||||
|  |     ; Store back to C | ||||||
|  |     store float %valC, float addrspace(1)* %ptrC, align 4 | ||||||
|  |  | ||||||
|  |     ret void | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   !nvvm.annotations = !{!0} | ||||||
|  |   !0 = metadata !{void (float addrspace(1)*, | ||||||
|  |                         float addrspace(1)*, | ||||||
|  |                         float addrspace(1)*)* @kernel, metadata !"kernel", i32 1}% | ||||||
|  |  | ||||||
|  |  | ||||||
|  | To compile this kernel, we perform the following steps: | ||||||
|  |  | ||||||
|  | 1. Link with libdevice | ||||||
|  | 2. Internalize all but the public kernel function | ||||||
|  | 3. Run ``NVVMReflect`` and set ``__CUDA_FTZ`` to 0 | ||||||
|  | 4. Optimize the linked module | ||||||
|  | 5. Codegen the module | ||||||
|  |  | ||||||
|  |  | ||||||
|  | These steps can be performed by the LLVM ``llvm-link``, ``opt``, and ``llc`` | ||||||
|  | tools. In a complete compiler, these steps can also be performed entirely | ||||||
|  | programmatically by setting up an appropriate pass configuration (see | ||||||
|  | :ref:`libdevice`). | ||||||
|  |  | ||||||
|  | .. code-block:: text | ||||||
|  |  | ||||||
|  |   # llvm-link t2.bc libdevice.compute_20.10.bc -o t2.linked.bc | ||||||
|  |   # opt -internalize -internalize-public-api-list=kernel -nvvm-reflect-list=__CUDA_FTZ=0 -nvvm-reflect -O3 t2.linked.bc -o t2.opt.bc | ||||||
|  |   # llc -mcpu=sm_20 t2.opt.bc -o t2.ptx | ||||||
|  |  | ||||||
|  | .. note:: | ||||||
|  |  | ||||||
|  |   The ``-nvvm-reflect-list=_CUDA_FTZ=0`` is not strictly required, as any | ||||||
|  |   undefined variables will default to zero. It is shown here for evaluation | ||||||
|  |   purposes. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | This gives us the following PTX (excerpt): | ||||||
|  |  | ||||||
|  | .. code-block:: text | ||||||
|  |  | ||||||
|  |   // | ||||||
|  |   // Generated by LLVM NVPTX Back-End | ||||||
|  |   // | ||||||
|  |  | ||||||
|  |   .version 3.1 | ||||||
|  |   .target sm_20 | ||||||
|  |   .address_size 64 | ||||||
|  |  | ||||||
|  |     // .globl kernel | ||||||
|  |                                           // @kernel | ||||||
|  |   .visible .entry kernel( | ||||||
|  |     .param .u64 kernel_param_0, | ||||||
|  |     .param .u64 kernel_param_1, | ||||||
|  |     .param .u64 kernel_param_2 | ||||||
|  |   ) | ||||||
|  |   { | ||||||
|  |     .reg .pred  %p<30>; | ||||||
|  |     .reg .f32   %f<111>; | ||||||
|  |     .reg .s32   %r<21>; | ||||||
|  |     .reg .s64   %rl<8>; | ||||||
|  |  | ||||||
|  |   // BB#0:                                // %entry | ||||||
|  |     ld.param.u64  %rl2, [kernel_param_0]; | ||||||
|  |     mov.u32   %r3, %tid.x; | ||||||
|  |     ld.param.u64  %rl3, [kernel_param_1]; | ||||||
|  |     mul.wide.s32  %rl4, %r3, 4; | ||||||
|  |     add.s64   %rl5, %rl2, %rl4; | ||||||
|  |     ld.param.u64  %rl6, [kernel_param_2]; | ||||||
|  |     add.s64   %rl7, %rl3, %rl4; | ||||||
|  |     add.s64   %rl1, %rl6, %rl4; | ||||||
|  |     ld.global.f32   %f1, [%rl5]; | ||||||
|  |     ld.global.f32   %f2, [%rl7]; | ||||||
|  |     setp.eq.f32 %p1, %f1, 0f3F800000; | ||||||
|  |     setp.eq.f32 %p2, %f2, 0f00000000; | ||||||
|  |     or.pred   %p3, %p1, %p2; | ||||||
|  |     @%p3 bra  BB0_1; | ||||||
|  |     bra.uni   BB0_2; | ||||||
|  |   BB0_1: | ||||||
|  |     mov.f32   %f110, 0f3F800000; | ||||||
|  |     st.global.f32   [%rl1], %f110; | ||||||
|  |     ret; | ||||||
|  |   BB0_2:                                  // %__nv_isnanf.exit.i | ||||||
|  |     abs.f32   %f4, %f1; | ||||||
|  |     setp.gtu.f32  %p4, %f4, 0f7F800000; | ||||||
|  |     @%p4 bra  BB0_4; | ||||||
|  |   // BB#3:                                // %__nv_isnanf.exit5.i | ||||||
|  |     abs.f32   %f5, %f2; | ||||||
|  |     setp.le.f32 %p5, %f5, 0f7F800000; | ||||||
|  |     @%p5 bra  BB0_5; | ||||||
|  |   BB0_4:                                  // %.critedge1.i | ||||||
|  |     add.f32   %f110, %f1, %f2; | ||||||
|  |     st.global.f32   [%rl1], %f110; | ||||||
|  |     ret; | ||||||
|  |   BB0_5:                                  // %__nv_isinff.exit.i | ||||||
|  |  | ||||||
|  |     ... | ||||||
|  |  | ||||||
|  |   BB0_26:                                 // %__nv_truncf.exit.i.i.i.i.i | ||||||
|  |     mul.f32   %f90, %f107, 0f3FB8AA3B; | ||||||
|  |     cvt.rzi.f32.f32 %f91, %f90; | ||||||
|  |     mov.f32   %f92, 0fBF317200; | ||||||
|  |     fma.rn.f32  %f93, %f91, %f92, %f107; | ||||||
|  |     mov.f32   %f94, 0fB5BFBE8E; | ||||||
|  |     fma.rn.f32  %f95, %f91, %f94, %f93; | ||||||
|  |     mul.f32   %f89, %f95, 0f3FB8AA3B; | ||||||
|  |     // inline asm | ||||||
|  |     ex2.approx.ftz.f32 %f88,%f89; | ||||||
|  |     // inline asm | ||||||
|  |     add.f32   %f96, %f91, 0f00000000; | ||||||
|  |     ex2.approx.f32  %f97, %f96; | ||||||
|  |     mul.f32   %f98, %f88, %f97; | ||||||
|  |     setp.lt.f32 %p15, %f107, 0fC2D20000; | ||||||
|  |     selp.f32  %f99, 0f00000000, %f98, %p15; | ||||||
|  |     setp.gt.f32 %p16, %f107, 0f42D20000; | ||||||
|  |     selp.f32  %f110, 0f7F800000, %f99, %p16; | ||||||
|  |     setp.eq.f32 %p17, %f110, 0f7F800000; | ||||||
|  |     @%p17 bra   BB0_28; | ||||||
|  |   // BB#27: | ||||||
|  |     fma.rn.f32  %f110, %f110, %f108, %f110; | ||||||
|  |   BB0_28:                                 // %__internal_accurate_powf.exit.i | ||||||
|  |     setp.lt.f32 %p18, %f1, 0f00000000; | ||||||
|  |     setp.eq.f32 %p19, %f3, 0f3F800000; | ||||||
|  |     and.pred    %p20, %p18, %p19; | ||||||
|  |     @!%p20 bra  BB0_30; | ||||||
|  |     bra.uni   BB0_29; | ||||||
|  |   BB0_29: | ||||||
|  |     mov.b32    %r9, %f110; | ||||||
|  |     xor.b32   %r10, %r9, -2147483648; | ||||||
|  |     mov.b32    %f110, %r10; | ||||||
|  |   BB0_30:                                 // %__nv_powf.exit | ||||||
|  |     st.global.f32   [%rl1], %f110; | ||||||
|  |     ret; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user