/* This code uses nvptx inline assembly guarded with acc_on_device, which is not optimized away at -O0, and then confuses the target assembler. { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ #include #include #define NUM_WORKERS 16 #define NUM_VECTORS 32 #define WIDTH 64 #define HEIGHT 32 #define WORK_ID(I,N) \ (acc_on_device (acc_device_nvidia) \ ? ({unsigned __r; \ __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (__r)); \ __r; }) : (I % N)) #define VEC_ID(I,N) \ (acc_on_device (acc_device_nvidia) \ ? ({unsigned __r; \ __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (__r)); \ __r; }) : (I % N)) #pragma acc routine worker void __attribute__ ((noinline)) WorkVec (int *ptr, int w, int h, int nw, int nv) { #pragma acc loop worker for (int i = 0; i < h; i++) #pragma acc loop vector for (int j = 0; j < w; j++) ptr[i*w + j] = (WORK_ID (i, nw) << 8) | VEC_ID(j, nv); } int DoWorkVec (int nw) { int ary[HEIGHT][WIDTH]; int err = 0; for (int ix = 0; ix != HEIGHT; ix++) for (int jx = 0; jx != WIDTH; jx++) ary[ix][jx] = 0xdeadbeef; printf ("spawning %d ...", nw); fflush (stdout); #pragma acc parallel num_workers(nw) vector_length (NUM_VECTORS) copy (ary) { WorkVec ((int *)ary, WIDTH, HEIGHT, nw, NUM_VECTORS); } for (int ix = 0; ix != HEIGHT; ix++) for (int jx = 0; jx != WIDTH; jx++) { int exp = ((ix % nw) << 8) | (jx % NUM_VECTORS); if (ary[ix][jx] != exp) { printf ("\nary[%d][%d] = %#x expected %#x", ix, jx, ary[ix][jx], exp); err = 1; } } printf (err ? " failed\n" : " ok\n"); return err; } int main () { int err = 0; for (int W = 1; W <= NUM_WORKERS; W <<= 1) err |= DoWorkVec (W); return err; }