/* ~/LITEX/riscv64-unknown-elf-gcc-10.1.0-2020.08.2-x86_64-linux-ubuntu14/bin/riscv64-unknown-elf-gcc -Os -S blit_goblin.c -march=rv32ib -mabi=ilp32 -mstrict-align -fno-builtin-memset -nostdlib -ffreestanding -nostartfiles ~/LITEX/riscv64-unknown-elf-gcc-10.1.0-2020.08.2-x86_64-linux-ubuntu14/bin/riscv64-unknown-elf-gcc -Os -o blit -march=rv32ib -mabi=ilp32 -T blit_goblin.lds -nostartfiles blit_goblin.s ~/LITEX/riscv64-unknown-elf-gcc-10.1.0-2020.08.2-x86_64-linux-ubuntu14/bin/riscv64-unknown-elf-objcopy -O binary -j .text blit blit_goblin.raw */ #ifndef BASE_FB #define BASE_FB 0x8F800000 // FIXME : should be generated ; 2+ MiB of SDRAM as framebuffer #warning "Using default BASE_FB" #endif #if defined(GOBLIN_NUBUS) #define BASE_ROM 0xF0910000 // FIXME : should be generated ; 4-64 KiB of Wishbone ROM ? ; also in the LDS file ; also in the Vex config #define BASE_RAM 0xF0902000 // FIXME : should be generated : 4-64 KiB of Wishbone SRAM ? ; also in _start #define BASE_RAM_SIZE 0x00001000 // FIXME : should be generated : 4-64 KiB of Wishbone SRAM ? ; also in _start #define BASE_BT_REGS 0xF0900000 #define BASE_ACCEL_REGS 0xF0901000 #elif defined(GOBLIN_SBUS) #define BASE_ROM 0x00410000 // FIXME : should be generated ; 4-64 KiB of Wishbone ROM ? ; also in the LDS file ; also in the Vex config #define BASE_RAM 0x00420000 // FIXME : should be generated : 4-64 KiB of Wishbone SRAM ? ; also in _start #define BASE_RAM_SIZE 0x00001000 // FIXME : should be generated : 4-64 KiB of Wishbone SRAM ? 
; also in _start #define BASE_BT_REGS 0x00200000 #define BASE_ACCEL_REGS 0x000c0000 #else #error "Must define GOBLIN_NUBUS or GOBLIN_SBUS" #endif //typedef void (*boot_t)(void); //typedef void (*start_t)(unsigned short, unsigned short, unsigned short, unsigned short, unsigned short, unsigned short, unsigned short, unsigned short); typedef unsigned int uint32_t; typedef volatile unsigned int u_int32_t; // X11 graphics functions #define GXclear 0x0 /* 0 */ #define GXand 0x1 /* src AND dst */ #define GXandReverse 0x2 /* src AND NOT dst */ #define GXcopy 0x3 /* src */ #define GXandInverted 0x4 /* NOT src AND dst */ #define GXnoop 0x5 /* dst */ #define GXxor 0x6 /* src XOR dst */ #define GXor 0x7 /* src OR dst */ #define GXnor 0x8 /* NOT src AND NOT dst */ #define GXequiv 0x9 /* NOT src XOR dst */ #define GXinvert 0xa /* NOT dst */ #define GXorReverse 0xb /* src OR NOT dst */ #define GXcopyInverted 0xc /* NOT src */ #define GXorInverted 0xd /* NOT src OR dst */ #define GXnand 0xe /* NOT src OR NOT dst */ #define GXset 0xf /* 1 */ // Xrender op #define PictOpClear (0x80 | 0x0) #define PictOpSrc (0x80 | 0x1) #define PictOpDst (0x80 | 0x2) #define PictOpOver (0x80 | 0x3) #define PictOpOverReverse (0x80 | 0x4) #define PictOpIn (0x80 | 0x5) #define PictOpInReverse (0x80 | 0x6) #define PictOpOut (0x80 | 0x7) #define PictOpOutReverse (0x80 | 0x8) #define PictOpAtop (0x80 | 0x9) #define PictOpAtopReverse (0x80 | 0xa) #define PictOpXor (0x80 | 0xb) #define PictOpAdd (0x80 | 0xc) #define PictOpSaturate (0x80 | 0xd) // custom, with 0x40 for 'flip src' #define PictOpFlipClear (0x80 | 0x40 | 0x0) #define PictOpFlipSrc (0x80 | 0x40 | 0x1) #define PictOpFlipDst (0x80 | 0x40 | 0x2) #define PictOpFlipOver (0x80 | 0x40 | 0x3) #define PictOpFlipOverReverse (0x80 | 0x40 | 0x4) #define PictOpFlipIn (0x80 | 0x40 | 0x5) #define PictOpFlipInReverse (0x80 | 0x40 | 0x6) #define PictOpFlipOut (0x80 | 0x40 | 0x7) #define PictOpFlipOutReverse (0x80 | 0x40 | 0x8) #define PictOpFlipAtop (0x80 | 0x40 
| 0x9) #define PictOpFlipAtopReverse (0x80 | 0x40 | 0xa) #define PictOpFlipXor (0x80 | 0x40 | 0xb) #define PictOpFlipAdd (0x80 | 0x40 | 0xc) #define PictOpFlipSaturate (0x80 | 0x40 | 0xd) #define FUN_BLIT_BIT 0 // hardwired in goblin_accel.py #define FUN_FILL_BIT 1 // hardwired in goblin_accel.py #define FUN_PATT_BIT 2 // hardwired in goblin_accel.py #define FUN_RSMSK8DST32_BIT 3 // hardwired in goblin_accel.py #define FUN_RSRC32MSK32DST32_BIT 4 // hardwired in goblin_accel.py #define FUN_RSRC32DST32_BIT 5 // hardwired in goblin_accel.py #define FUN_DONE_BIT 31 #define FUN_BLIT (1<(b))?(a):(b)) #define imin(a,b) (((a)<(b))?(a):(b)) #define DEBUG #ifdef DEBUG #define SHOW_FUN(a) /* fbc->fbc_r5_status[0] = a */ #define SHOW_PC() /* SHOW_FUN(cmd); do { u_int32_t rd; asm volatile("auipc %[rd], 0" : [rd]"=r"(rd) ) ; fbc->fbc_r5_status[1] = rd; } while (0) */ #define SHOW_PC_2VAL(a, b) /* SHOW_PC(); fbc->fbc_r5_status[2] = a; fbc->fbc_r5_status[3] = b */ #else #define SHOW_FUN(a) #define SHOW_PC() #define SHOW_PC_2VAL(a, b) #endif /* need some way to have identifiable proc# and multiple struct control_blitter for //ism */ /* First need to set up essential C stuff like the stack */ /* maybe pass core-id as the first parameter (in a0) to everyone */ /* also need to figure out the non-coherent caches ... 
*/ void from_reset(void) { struct goblin_accel_regs* fbc = (struct goblin_accel_regs*)BASE_ACCEL_REGS; struct goblin_bt_regs* fbt = (struct goblin_bt_regs*)BASE_BT_REGS; unsigned int cmd = fbc->reg_r5_cmd; unsigned char depth = fbc->reg_depth; unsigned char op = fbc->reg_op; uint32_t srcx, wi, dstx; if (depth == 0) { #if defined(GOBLIN_NUBUS) switch ((fbt->mode>>24) & 0xFF) // mode is 8 bits wrong-endian (all fbt is wrong-endian in NuBus version) #elif defined(GOBLIN_SBUS) switch (fbt->mode & 0xFF) #else #error "Must define GOBLIN_NUBUS or GOBLIN_SBUS" #endif { case mode_32bit: depth = 32; break; case mode_16bit: depth = 16; break; default: case mode_8bit: depth = 8; break; case mode_4bit: depth = 4; break; case mode_2bit: depth = 2; break; case mode_1bit: depth = 1; break; } } switch (depth) { case 32: srcx = fbc->reg_bitblt_src_x << 2; wi = fbc->reg_width << 2; dstx = fbc->reg_bitblt_dst_x << 2; break; case 16: srcx = fbc->reg_bitblt_src_x << 1; wi = fbc->reg_width << 1; dstx = fbc->reg_bitblt_dst_x << 1; break; default: case 8: srcx = fbc->reg_bitblt_src_x; wi = fbc->reg_width; dstx = fbc->reg_bitblt_dst_x; break; case 4: srcx = fbc->reg_bitblt_src_x >> 1; wi = fbc->reg_width >> 1; dstx = fbc->reg_bitblt_dst_x >> 1; break; case 2: srcx = fbc->reg_bitblt_src_x >> 2; wi = fbc->reg_width >> 2; dstx = fbc->reg_bitblt_dst_x >> 2; break; case 1: srcx = fbc->reg_bitblt_src_x >> 3; wi = fbc->reg_width >> 3; dstx = fbc->reg_bitblt_dst_x >> 3; break; } switch (cmd & 0xFF) { case FUN_BLIT: { bitblit(srcx, fbc->reg_bitblt_src_y, wi , fbc->reg_height, dstx, fbc->reg_bitblt_dst_y, 0xFF, op, // FIXME: re-add planemask support for X11 ops fbc->reg_src_ptr ? (unsigned char*)fbc->reg_src_ptr : (unsigned char*)BASE_FB, fbc->reg_dst_ptr ? 
(unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB, fbc->reg_src_stride, fbc->reg_dst_stride); // assumed to be scaled already } break; case FUN_FILL: { rectfill(dstx, fbc->reg_bitblt_dst_y, wi , fbc->reg_height, fbc->reg_fgcolor, fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB, fbc->reg_dst_stride); // assumed to be scaled already } break; case FUN_PATT: { patternrectfill(dstx, fbc->reg_bitblt_dst_y, wi , fbc->reg_height, (unsigned char*)BASE_FB + (8*1024*1024) - (64*1024), // FIXME fbc->reg_bitblt_src_x, // unscaled fbc->reg_bitblt_src_y, // unscaled fbc->reg_src_stride, fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB, fbc->reg_dst_stride); // assumed to be scaled already } break; case FUN_RSMSK8DST32: { bitblit_solid_msk8_dst32_fwd_fwd(op, fbc->reg_bitblt_msk_x, // unscaled, 8 bits fbc->reg_bitblt_msk_y, fbc->reg_width, // NOT scaled here, we assume depth == 32 here fbc->reg_height, dstx, // still scaled for the PTR calculation ... fbc->reg_bitblt_dst_y, fbc->reg_fgcolor, fbc->reg_msk_ptr ? (unsigned char*)fbc->reg_msk_ptr : (unsigned char*)BASE_FB, fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB, fbc->reg_msk_stride, // assumed to be scaled already fbc->reg_dst_stride); // assumed to be scaled already } break; case FUN_RSRC32MSK32DST32: { bitblit_src32_msk32_dst32_fwd_fwd(op, srcx, // still scaled for the PTR calculation ... fbc->reg_bitblt_src_y, fbc->reg_bitblt_msk_x << 2, // assume 32 bits // still scaled for the PTR calculation ... fbc->reg_bitblt_msk_y, fbc->reg_width, // NOT scaled here, we assume depth == 32 here fbc->reg_height, dstx, // still scaled for the PTR calculation ... fbc->reg_bitblt_dst_y, fbc->reg_src_ptr ? (unsigned char*)fbc->reg_src_ptr : (unsigned char*)BASE_FB, fbc->reg_msk_ptr ? (unsigned char*)fbc->reg_msk_ptr : (unsigned char*)BASE_FB, fbc->reg_dst_ptr ? 
(unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB, fbc->reg_src_stride, // assumed to be scaled already fbc->reg_msk_stride, // assumed to be scaled already fbc->reg_dst_stride); // assumed to be scaled already } break; case FUN_RSRC32DST32: { bitblit_src32_dst32_fwd_fwd(op, srcx, fbc->reg_bitblt_src_y, fbc->reg_width, // NOT scaled here, we assume depth == 32 here fbc->reg_height, dstx, // still scaled for the PTR calculation ... fbc->reg_bitblt_dst_y, fbc->reg_src_ptr ? (unsigned char*)fbc->reg_src_ptr : (unsigned char*)BASE_FB, fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB, fbc->reg_src_stride, // assumed to be scaled already fbc->reg_dst_stride); // assumed to be scaled already } break; default: break; } finish: // make sure we have nothing left in the cache flush_cache(); fbc->reg_r5_cmd = FUN_DONE; done: /* wait for reset */ goto done; } #define bitblit_proto_int(a, b, suf) \ static void bitblit##a##b##suf(const unsigned_param_type xs, \ const unsigned_param_type ys, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ const unsigned char pm, \ unsigned char *src_ptr, \ unsigned char *dst_ptr, \ const unsigned_param_type src_stride , \ const unsigned_param_type dst_stride \ ) #define bitblit_proto(suf) \ bitblit_proto_int(_fwd, _fwd, suf); \ bitblit_proto_int(_bwd, _fwd, suf); \ bitblit_proto_int(_fwd, _bwd, suf) // bitblit_proto_int(_bwd, _bwd, suf); bitblit_proto(_copy); bitblit_proto(_xor); bitblit_proto(_copy_pm); bitblit_proto(_xor_pm); bitblit_proto(_radd); #define ROUTE_BITBLIT_PM(pm, bb) \ if (pm == 0xFF) bb(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride); \ else bb##_pm(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride) static void bitblit(const unsigned_param_type xs, const unsigned_param_type ys, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type xd, const 
unsigned_param_type yd, const unsigned char pm, const unsigned char gxop, unsigned char *src_ptr, unsigned char *dst_ptr, const unsigned_param_type src_stride, const unsigned_param_type dst_stride ) { struct goblin_accel_regs* fbc = (struct goblin_accel_regs*)BASE_ACCEL_REGS; if (ys > yd) { switch(gxop) { case GXcopy: ROUTE_BITBLIT_PM(pm, bitblit_fwd_fwd_copy); break; case GXxor: ROUTE_BITBLIT_PM(pm, bitblit_fwd_fwd_xor); break; case PictOpAdd: bitblit_fwd_fwd_radd(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride); break; } } else if (ys < yd) { switch(gxop) { case GXcopy: ROUTE_BITBLIT_PM(pm, bitblit_bwd_fwd_copy); break; case GXxor: ROUTE_BITBLIT_PM(pm, bitblit_bwd_fwd_xor); break; case PictOpAdd: bitblit_bwd_fwd_radd(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride); break; } } else { // ys == yd if (xs > xd) { switch(gxop) { case GXcopy: ROUTE_BITBLIT_PM(pm, bitblit_fwd_fwd_copy); break; case GXxor: ROUTE_BITBLIT_PM(pm, bitblit_fwd_fwd_xor); break; case PictOpAdd: bitblit_fwd_fwd_radd(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride); break; } } else if (xs < xd) { switch(gxop) { case GXcopy: ROUTE_BITBLIT_PM(pm, bitblit_fwd_bwd_copy); break; case GXxor: ROUTE_BITBLIT_PM(pm, bitblit_fwd_bwd_xor); break; case PictOpAdd: bitblit_fwd_bwd_radd(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride); break; } } else { // xs == xd switch(gxop) { case GXcopy: /* don't bother */ break; case GXxor: rectfill_pm(xd, yd, wi, re, 0, pm, dst_ptr, dst_stride); break; } } } } static void rectfill(const unsigned_param_type xd, const unsigned_param_type yd, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type color, unsigned char* dst_ptr, const unsigned_param_type dst_stride ) { struct goblin_accel_regs* fbc = (struct goblin_accel_regs*)BASE_ACCEL_REGS; unsigned int i, j; unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); unsigned char *dptr_line = dptr; unsigned 
char u8color = color & 0xFF; for (j = 0 ; j < re ; j++) { unsigned char *dptr_elt = dptr_line; i = 0; for ( ; i < wi && ((unsigned int)dptr_elt&0x3)!=0; i++) { *dptr_elt = u8color; dptr_elt ++; } if (wi > 3) { if ((wi>15) && (((unsigned int)dptr_elt&0x7)==0)) { register unsigned int s8 asm("s8"); register unsigned int s9 asm("s9"); s8 = color; s9 = color; for ( ; i < (wi-15) ; i+=16) { _custom_sd(dptr_elt, 0, 0, s8, s9); _custom_sd(dptr_elt, 8, 0, s8, s9); dptr_elt += 16; } } for ( ; i < (wi-3) ; i+=4) { *(unsigned int*)dptr_elt = color; dptr_elt +=4; } } for ( ; i < wi ; i++) { *dptr_elt = u8color; dptr_elt ++; } dptr_line += dst_stride; } } static void rectfill_pm(const unsigned_param_type xd, const unsigned_param_type yd, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type color, const unsigned char pm, unsigned char* dst_ptr, const unsigned_param_type dst_stride ) { struct goblin_accel_regs* fbc = (struct goblin_accel_regs*)BASE_ACCEL_REGS; unsigned int i, j; unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); unsigned char *dptr_line = dptr; unsigned char u8color = color; for (j = 0 ; j < re ; j++) { unsigned char *dptr_elt = dptr_line; i = 0; for ( ; i < wi && ((unsigned int)dptr_elt&0x3)!=0; i++) { *dptr_elt = (u8color & pm) | (*dptr_elt & ~pm); dptr_elt ++; } if (wi > 3) { unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; for ( ; i < (wi-3) ; i+=4) { *(unsigned int*)dptr_elt = (color & u32pm) | (*(unsigned int*)dptr_elt & ~u32pm); dptr_elt +=4; } } for ( ; i < wi ; i++) { *dptr_elt = (u8color & pm) | (*dptr_elt & ~pm); dptr_elt ++; } dptr_line += dst_stride; } } static void xorrectfill(const unsigned_param_type xd, const unsigned_param_type yd, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type color, unsigned char* dst_ptr, const unsigned_param_type dst_stride ) { struct goblin_accel_regs* fbc = (struct 
goblin_accel_regs*)BASE_ACCEL_REGS; unsigned int i, j; unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); unsigned char *dptr_line = dptr; unsigned char u8color = color & 0xFF; for (j = 0 ; j < re ; j++) { unsigned char *dptr_elt = dptr_line; i = 0; for ( ; i < wi && ((unsigned int)dptr_elt&0x3)!=0; i++) { *dptr_elt ^= u8color; dptr_elt ++; } if (wi > 3) { for ( ; i < (wi-3) ; i+=4) { *(unsigned int*)dptr_elt ^= color; dptr_elt +=4; } } for ( ; i < wi ; i++) { *dptr_elt ^= u8color; dptr_elt ++; } dptr_line += dst_stride; } } static void xorrectfill_pm(const unsigned_param_type xd, const unsigned_param_type yd, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type color, const unsigned char pm, unsigned char* dst_ptr, const unsigned_param_type dst_stride ) { struct goblin_accel_regs* fbc = (struct goblin_accel_regs*)BASE_ACCEL_REGS; unsigned int i, j; unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); unsigned char *dptr_line = dptr; unsigned char u8color = color; for (j = 0 ; j < re ; j++) { unsigned char *dptr_elt = dptr_line; i = 0; for ( ; i < wi && ((unsigned int)dptr_elt&0x3)!=0; i++) { *dptr_elt ^= (u8color & pm); dptr_elt ++; } if (wi > 3) { unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; for ( ; i < (wi-3) ; i+=4) { *(unsigned int*)dptr_elt ^= (color & u32pm); dptr_elt +=4; } } for ( ; i < wi ; i++) { *dptr_elt ^= (u8color & pm); dptr_elt ++; } dptr_line += dst_stride; } } static void invert(const unsigned_param_type xd, const unsigned_param_type yd, const unsigned_param_type wi, const unsigned_param_type re, unsigned char* dst_ptr, const unsigned_param_type dst_stride ) { struct goblin_accel_regs* fbc = (struct goblin_accel_regs*)BASE_ACCEL_REGS; unsigned int i, j; unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); unsigned char *dptr_line = dptr; for (j = 0 ; j < re ; j++) { unsigned char *dptr_elt = dptr_line; i = 0; for ( ; i < wi && 
((unsigned int)dptr_elt&0x3)!=0; i++) { *dptr_elt = ~(*dptr_elt); dptr_elt ++; } if (wi > 3) { for ( ; i < (wi-3) ; i+=4) { *(unsigned int*)dptr_elt = ~(*(unsigned int*)dptr_elt); dptr_elt +=4; } } for ( ; i < wi ; i++) { *dptr_elt = ~(*dptr_elt); dptr_elt ++; } dptr_line += dst_stride; } } // X11 // NOT using npm enables the use of 'cmix' in more cases #define COPY(d,s,pm,npm) (d) = (s) //#define COPY_PM(d,s,pm,npm) (d) = (((s) & (pm)) | ((d) & (npm))) #define COPY_PM(d,s,pm,npm) (d) = (((s) & (pm)) | ((d) & (~pm))) #define XOR(d,s,pm,npm) (d) = ((s) ^ (d)) //#define XOR_PM(d,s,pm,npm) (d) = ((((s) ^ (d)) & (pm)) | ((d) & (npm))) #define XOR_PM(d,s,pm,npm) (d) = ((((s) ^ (d)) & (pm)) | ((d) & (~pm))) // Xrender #define RADD(d,s,pm,npm) (d) = ukadd8((d), (s)) #define BLIT_FWD_FWD(NAME, OP) \ static void bitblit_fwd_fwd_##NAME(const unsigned_param_type xs, \ const unsigned_param_type ys, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ const unsigned char pm, \ unsigned char* src_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type src_stride, \ const unsigned_param_type dst_stride) { \ unsigned int i, j; \ unsigned char *sptr = (src_ptr + (ys * src_stride) + xs); \ unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); \ unsigned char *sptr_line = sptr; \ unsigned char *dptr_line = dptr; \ /*const unsigned char npm = ~pm;*/ \ \ for (j = 0 ; j < re ; j++) { \ unsigned char *sptr_elt = sptr_line; \ unsigned char *dptr_elt = dptr_line; \ i = 0; \ if (wi>3) { \ if ((xs & 0x3) || (xd & 0x3)) { \ for ( ; i < wi && ((unsigned int)dptr_elt&0x3)!=0; i++) { \ OP(*dptr_elt, *sptr_elt, pm, npm); \ dptr_elt ++; \ sptr_elt ++; \ } \ unsigned char *sptr_elt_al = (unsigned char*)((unsigned int)sptr_elt & ~0x3); \ unsigned int fsr_cst = 8*((unsigned int)sptr_elt & 0x3); \ unsigned int src0 = ((unsigned int*)sptr_elt_al)[0]; \ unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | 
((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; \ for ( ; i < (wi-3) ; i+=4) { \ unsigned int src1 = ((unsigned int*)sptr_elt_al)[1]; \ unsigned int val; \ asm("fsr %0, %1, %2, %3\n" : "=r"(val) : "r"(src0), "r"(src1), "r"(fsr_cst)); \ OP(*(unsigned int*)dptr_elt, val, u32pm, u32npm); \ src0 = src1; \ dptr_elt += 4; \ sptr_elt_al += 4; \ } \ sptr_elt = sptr_elt_al + ((unsigned int)sptr_elt & 0x3); \ } else { \ const unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; \ /*const unsigned int u32npm = (unsigned int)npm | ((unsigned int)npm)<<8 | ((unsigned int)npm)<<16 | ((unsigned int)npm)<<24;*/ \ if (((xs & 0xf) == 0) && ((xd & 0xf) == 0)) { \ for ( ; i < (wi&(~0xf)) ; i+= 16) { \ OP(((unsigned int*)dptr_elt)[0], ((unsigned int*)sptr_elt)[0], u32pm, u32npm); \ OP(((unsigned int*)dptr_elt)[1], ((unsigned int*)sptr_elt)[1], u32pm, u32npm); \ OP(((unsigned int*)dptr_elt)[2], ((unsigned int*)sptr_elt)[2], u32pm, u32npm); \ OP(((unsigned int*)dptr_elt)[3], ((unsigned int*)sptr_elt)[3], u32pm, u32npm); \ dptr_elt += 16; \ sptr_elt += 16; \ } \ } \ for ( ; i < (wi&(~3)) ; i+= 4) { \ OP(((unsigned int*)dptr_elt)[0], ((unsigned int*)sptr_elt)[0], u32pm, u32npm); \ dptr_elt += 4; \ sptr_elt += 4; \ } \ } \ } \ for ( ; i < wi ; i++) { \ OP(*dptr_elt, *sptr_elt, pm, npm); \ dptr_elt ++; \ sptr_elt ++; \ } \ sptr_line += src_stride; \ dptr_line += dst_stride; \ } \ } #define BLIT_FWD_BWD(NAME, OP) \ static void bitblit_fwd_bwd_##NAME(const unsigned_param_type xs, \ const unsigned_param_type ys, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ const unsigned char pm, \ unsigned char* src_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type src_stride, \ const unsigned_param_type dst_stride) { \ unsigned int i, j; \ unsigned char *sptr = (src_ptr + (ys * src_stride) + xs); \ unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); \ 
unsigned char *sptr_line = sptr + wi - 1; \ unsigned char *dptr_line = dptr + wi - 1; \ const unsigned char npm = ~pm; \ \ for (j = 0 ; j < re ; j++) { \ unsigned char *sptr_elt = sptr_line; \ unsigned char *dptr_elt = dptr_line; \ for (i = 0 ; i < wi ; i++) { \ OP(*dptr_elt, *sptr_elt, pm, npm); \ dptr_elt --; \ sptr_elt --; \ } \ sptr_line += src_stride; \ dptr_line += dst_stride; \ } \ } #define BLIT_BWD_FWD(NAME, OP) \ static void bitblit_bwd_fwd_##NAME(const unsigned_param_type xs, \ const unsigned_param_type ys, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ const unsigned char pm, \ unsigned char* src_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type src_stride, \ const unsigned_param_type dst_stride) { \ unsigned int i, j; \ unsigned char *sptr = (src_ptr + (ys * src_stride) + xs); \ unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); \ unsigned char *sptr_line = sptr + ((re-1) * src_stride); \ unsigned char *dptr_line = dptr + ((re-1) * dst_stride); \ const unsigned char npm = ~pm; \ \ for (j = 0 ; j < re ; j++) { \ unsigned char *sptr_elt = sptr_line; \ unsigned char *dptr_elt = dptr_line; \ i = 0; \ if (wi>3) { \ if ((xs & 0x3) || (xd & 0x3)) { \ for ( ; i < wi && ((unsigned int)dptr_elt&0x3)!=0; i++) { \ OP(*dptr_elt, *sptr_elt, pm, npm); \ dptr_elt ++; \ sptr_elt ++; \ } \ unsigned char *sptr_elt_al = (unsigned char*)((unsigned int)sptr_elt & ~0x3); \ unsigned int fsr_cst = 8*((unsigned int)sptr_elt & 0x3); \ unsigned int src0 = ((unsigned int*)sptr_elt_al)[0]; \ unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; \ for ( ; i < (wi-3) ; i+=4) { \ unsigned int src1 = ((unsigned int*)sptr_elt_al)[1]; \ unsigned int val; \ asm("fsr %0, %1, %2, %3\n" : "=r"(val) : "r"(src0), "r"(src1), "r"(fsr_cst)); \ OP(*(unsigned int*)dptr_elt, val, u32pm, u32npm); \ src0 = src1; \ dptr_elt += 4; \ sptr_elt_al += 
4; \ } \ sptr_elt = sptr_elt_al + ((unsigned int)sptr_elt & 0x3); \ } else { \ if (((xs & 0xf) == 0) && ((xd & 0xf) == 0)) { \ for ( ; i < (wi&(~0xf)) ; i+= 16) { \ const unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; \ /*const unsigned int u32npm = (unsigned int)npm | ((unsigned int)npm)<<8 | ((unsigned int)npm)<<16 | ((unsigned int)npm)<<24;*/ \ OP(((unsigned int*)dptr_elt)[0], ((unsigned int*)sptr_elt)[0], u32pm, u32npm); \ OP(((unsigned int*)dptr_elt)[1], ((unsigned int*)sptr_elt)[1], u32pm, u32npm); \ OP(((unsigned int*)dptr_elt)[2], ((unsigned int*)sptr_elt)[2], u32pm, u32npm); \ OP(((unsigned int*)dptr_elt)[3], ((unsigned int*)sptr_elt)[3], u32pm, u32npm); \ dptr_elt += 16; \ sptr_elt += 16; \ } \ } \ if (((xs & 0x3) == 0) && ((xd & 0x3) == 0)) { \ for ( ; i < (wi&(~3)) ; i+= 4) { \ const unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; \ /*const unsigned int u32npm = (unsigned int)npm | ((unsigned int)npm)<<8 | ((unsigned int)npm)<<16 | ((unsigned int)npm)<<24;*/ \ OP(((unsigned int*)dptr_elt)[0], ((unsigned int*)sptr_elt)[0], u32pm, u32npm); \ dptr_elt += 4; \ sptr_elt += 4; \ } \ } \ } \ } \ for ( ; i < wi ; i++) { \ OP(*dptr_elt, *sptr_elt, pm, npm); \ dptr_elt ++; \ sptr_elt ++; \ } \ sptr_line -= src_stride; \ dptr_line -= dst_stride; \ } \ } #define BLIT_ALLDIR(NAME, OP) \ BLIT_FWD_FWD(NAME, OP) \ BLIT_FWD_BWD(NAME, OP) \ BLIT_BWD_FWD(NAME, OP) \ #define BLIT_NOTALLDIR(NAME, OP) \ BLIT_FWD_BWD(NAME, OP) \ BLIT_BWD_FWD(NAME, OP) \ //BLIT_ALLDIR(copy, COPY) BLIT_NOTALLDIR(copy, COPY) BLIT_ALLDIR(xor, XOR) BLIT_ALLDIR(copy_pm, COPY_PM) BLIT_ALLDIR(xor_pm, XOR_PM) BLIT_ALLDIR(radd, RADD) static void bitblit_fwd_fwd_copy(const unsigned_param_type xs, const unsigned_param_type ys, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type xd, const unsigned_param_type yd, const unsigned char pm, 
unsigned char* src_ptr,
				 unsigned char* dst_ptr,
				 const unsigned_param_type src_stride,
				 const unsigned_param_type dst_stride) {
	/*
	 * Hand-written forward copy (rows top-to-bottom, bytes left-to-right).
	 * pm is not used: the bytes are copied unmasked.
	 *
	 * For rows wider than 3 bytes, three cases depending on the relative
	 * alignment of xs and xd:
	 *  - different modulo 4: align the destination, rebuild every source
	 *    word from two aligned loads with 'fsr';
	 *  - same modulo 4 but different modulo 8: plain 32-bit copies;
	 *  - same modulo 8: align to 8, then 16 bytes per iteration through
	 *    _custom_ld/_custom_sd, then 32-bit copies.
	 * The remaining bytes are copied one by one by the common tail loop.
	 */
	unsigned int j;
	unsigned char *sptr = (src_ptr + (ys * src_stride) + xs);
	unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd);
	unsigned char *sptr_line = sptr;
	unsigned char *dptr_line = dptr;
	/*const unsigned char npm = ~pm;*/

	for (j = 0 ; j < re ; j++) {
		register unsigned char *sptr_elt = sptr_line;
		unsigned char *dptr_elt = dptr_line;
		const unsigned char *dptr_elt_last = dptr_line + wi;
		if (wi>3) {
			if ((xs & 0x3) != (xd & 0x3)) {
				/* align dest, we'll deal with src via shift realignment using fsr */
				for ( ; (dptr_elt < dptr_elt_last) && ((unsigned int)dptr_elt&0x3)!=0; ) {
					dptr_elt[0] = sptr_elt[0];
					dptr_elt ++;
					sptr_elt ++;
				}
				unsigned char *sptr_elt_al = (unsigned char*)((unsigned int)sptr_elt & ~0x3);
				unsigned int fsr_cst = 8*((unsigned int)sptr_elt & 0x3);
				unsigned int src0 = ((unsigned int*)sptr_elt_al)[0];
				/* handle unaligned src */
				for ( ; (dptr_elt < (dptr_elt_last-3)) ; ) {
					unsigned int src1 = ((unsigned int*)sptr_elt_al)[1];
					unsigned int val;
					asm("fsr %0, %1, %2, %3\n" : "=r"(val) : "r"(src0), "r"(src1), "r"(fsr_cst));
					((unsigned int*)dptr_elt)[0] = val;
					src0 = src1;
					dptr_elt += 4;
					sptr_elt_al += 4;
				}
				/* back to a byte pointer: re-apply the sub-word offset */
				sptr_elt = sptr_elt_al + ((unsigned int)sptr_elt & 0x3);
			} else if ((xs & 0x7) != (xd & 0x7)) {
				/* off-by-4, can't use 64 ld/sd directly (could pipeline the 32-bits data) but still can use 32-bits */
				/* align dest & src (they are aligned the same here up to 0x3) */
				for ( ; (dptr_elt < dptr_elt_last) && ((unsigned int)dptr_elt&0x3)!=0; ) {
					dptr_elt[0] = sptr_elt[0];
					dptr_elt ++;
					sptr_elt ++;
				}
				for ( ; (dptr_elt < (dptr_elt_last-3)) ; ) {
					((unsigned int*)dptr_elt)[0] = ((unsigned int*)sptr_elt)[0];
					dptr_elt += 4;
					sptr_elt += 4;
				}
			} else {
				/* align dest & src (they are aligned the same here) */
				for ( ; (dptr_elt < dptr_elt_last) && ((unsigned int)dptr_elt&0x3)!=0; ) {
					dptr_elt[0] = sptr_elt[0];
					dptr_elt ++;
					sptr_elt ++;
				}
				/* align to 8 for ld/sd */
				for ( ; (dptr_elt < (dptr_elt_last-3)) && ((unsigned int)dptr_elt&0x7)!=0;) {
					((unsigned int*)dptr_elt)[0] = ((unsigned int*)sptr_elt)[0];
					dptr_elt += 4;
					sptr_elt += 4;
				}
#if 0
				for ( ; (dptr_elt < (dptr_elt_last-31)) ; ) {
					register unsigned int s4 asm("s4");
					register unsigned int s5 asm("s5");
					register unsigned int s6 asm("s6");
					register unsigned int s7 asm("s7");
					register unsigned int s8 asm("s8");
					register unsigned int s9 asm("s9");
					register unsigned int s10 asm("s10");
					register unsigned int s11 asm("s11");
					_custom_ld(sptr_elt, 0, s4, s5);
					_custom_ld(sptr_elt, 16, s8, s9);
					_custom_ld(sptr_elt, 8, s6, s7);
					_custom_sd(dptr_elt, 0, 0, s4, s5);
					_custom_sd(dptr_elt, 8, 0, s6, s7);
					_custom_ld(sptr_elt, 24, s10, s11);
					_custom_sd(dptr_elt, 16, 0, s8, s9);
					sptr_elt += 32;
					_custom_sd(dptr_elt, 24, 0, s10, s11);
					dptr_elt += 32;
				}
#endif
				/* 16 bytes per iteration; the statement order is deliberate, keep it */
				for ( ; (dptr_elt < (dptr_elt_last-15)) ; ) {
					register unsigned int s8 asm("s8");
					register unsigned int s9 asm("s9");
					register unsigned int s10 asm("s10");
					register unsigned int s11 asm("s11");
					_custom_ld(sptr_elt, 0, s8, s9);
					_custom_ld(sptr_elt, 8, s10, s11);
					_custom_sd(dptr_elt, 0, 0, s8, s9);
					sptr_elt += 16;
					_custom_sd(dptr_elt, 8, 0, s10, s11);
					dptr_elt += 16;
				}
#if 0
				for ( ; (dptr_elt < (dptr_elt_last-7)) ; ) {
					register unsigned int s8 asm("s8");
					register unsigned int s9 asm("s9");
					_custom_ld(sptr_elt, 0, s8, s9);
					_custom_sd(dptr_elt, 0, 0, s8, s9);
					sptr_elt += 8;
					dptr_elt += 8;
				}
#endif
				for ( ; (dptr_elt < (dptr_elt_last-3)) ; ) {
					((unsigned int*)dptr_elt)[0] =
((unsigned int*)sptr_elt)[0]; dptr_elt += 4; sptr_elt += 4; } } } /* common tail loop */ for ( ; dptr_elt < dptr_elt_last ; ) { dptr_elt[0] = sptr_elt[0]; dptr_elt ++; sptr_elt ++; } sptr_line += src_stride; dptr_line += dst_stride; } } static void patternrectfill(const unsigned_param_type xd, const unsigned_param_type yd, const unsigned_param_type wi, const unsigned_param_type re, unsigned char *pat_ptr, const unsigned_param_type pat_xmask, const unsigned_param_type pat_ymask, const unsigned_param_type pat_stride, unsigned char* dst_ptr, const unsigned_param_type dst_stride ) { struct goblin_accel_regs* fbc = (struct goblin_accel_regs*)BASE_ACCEL_REGS; unsigned int i, j; unsigned int io, jo; unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); unsigned char *dptr_line = dptr; unsigned char *pat_ptr_line; io = xd & pat_xmask; jo = yd & pat_ymask; pat_ptr_line = pat_ptr + (jo & pat_ymask) * pat_stride; for (j = 0 ; j < re ; j++) { unsigned char *dptr_elt = dptr_line; i = 0; for ( ; i < wi && ((unsigned int)dptr_elt&0x3)!=0; i++) { dptr_elt[0] = pat_ptr_line[(i+io) & pat_xmask]; dptr_elt ++; } if (wi > 3) { unsigned int fsr_cst = 8*((i+io) & 0x3); unsigned int src0 = ((unsigned int*)pat_ptr_line)[((i+io) & pat_xmask) >> 2]; for ( ; i < (wi-3) ; i+=4) { unsigned int src1 = ((unsigned int*)pat_ptr_line)[((i+io+4) & pat_xmask) >> 2]; unsigned int val; asm("fsr %0, %1, %2, %3\n" : "=r"(val) : "r"(src0), "r"(src1), "r"(fsr_cst)); ((unsigned int*)dptr_elt)[0] = val; src0 = src1; dptr_elt += 4; } } for ( ; i < wi ; i++) { dptr_elt[0] = pat_ptr_line[(i+io) & pat_xmask]; dptr_elt ++; } dptr_line += dst_stride; pat_ptr_line = pat_ptr + ((j+jo) & pat_ymask) * pat_stride; } } #define bitblit_render_proto(a, b, suf) \ static void bitblit_solid_msk8_dst32##a##b##suf(const unsigned_param_type xm, \ const unsigned_param_type ym, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ const unsigned int 
fgcolor, \ unsigned char* msk_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type msk_stride, \ const unsigned_param_type dst_stride); \ static void bitblit_src32_msk32_dst32##a##b##suf(const unsigned_param_type xs, \ const unsigned_param_type ys, \ const unsigned_param_type xm, \ const unsigned_param_type ym, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ unsigned char* src_ptr, \ unsigned char* msk_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type src_stride, \ const unsigned_param_type msk_stride, \ const unsigned_param_type dst_stride); \ static void bitblit_src32_dst32##a##b##suf(const unsigned_param_type xs, \ const unsigned_param_type ys, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ unsigned char* src_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type src_stride, \ const unsigned_param_type dst_stride); bitblit_render_proto(_fwd, _fwd, _over) bitblit_render_proto(_fwd, _fwd, _fover) bitblit_render_proto(_fwd, _fwd, _outreverse) static void bitblit_solid_msk8_dst32_fwd_fwd(const unsigned char op, const unsigned_param_type xm, const unsigned_param_type ym, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type xd, const unsigned_param_type yd, const unsigned int fgcolor, unsigned char* msk_ptr, unsigned char* dst_ptr, const unsigned_param_type msk_stride, const unsigned_param_type dst_stride) { switch (op) { case PictOpOver: bitblit_solid_msk8_dst32_fwd_fwd_over(xm, ym, wi, re, xd, yd, fgcolor, msk_ptr, dst_ptr, msk_stride, dst_stride); break; /* case PictOpOutReverse: */ /* bitblit_solid_msk8_dst32_fwd_fwd_outreverse(xm, ym, wi, re, xd, yd, fgcolor, msk_ptr, dst_ptr, msk_stride, dst_stride); */ /* break; */ default: break; } } static void bitblit_src32_msk32_dst32_fwd_fwd(const unsigned char op, const unsigned_param_type xs, const 
unsigned_param_type ys, const unsigned_param_type xm, const unsigned_param_type ym, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type xd, const unsigned_param_type yd, unsigned char* src_ptr, unsigned char* msk_ptr, unsigned char* dst_ptr, const unsigned_param_type src_stride, const unsigned_param_type msk_stride, const unsigned_param_type dst_stride) { switch (op) { case PictOpOver: bitblit_src32_msk32_dst32_fwd_fwd_over(xs, ys, xm, ym, wi, re, xd, yd, src_ptr, msk_ptr, dst_ptr, src_stride, msk_stride, dst_stride); break; case PictOpFlipOver: bitblit_src32_msk32_dst32_fwd_fwd_fover(xs, ys, xm, ym, wi, re, xd, yd, src_ptr, msk_ptr, dst_ptr, src_stride, msk_stride, dst_stride); break; default: break; } } static void bitblit_src32_dst32_fwd_fwd(const unsigned char op, const unsigned_param_type xs, const unsigned_param_type ys, const unsigned_param_type wi, const unsigned_param_type re, const unsigned_param_type xd, const unsigned_param_type yd, unsigned char* src_ptr, unsigned char* dst_ptr, const unsigned_param_type src_stride, const unsigned_param_type dst_stride) { switch (op) { case PictOpOver: bitblit_src32_dst32_fwd_fwd_over(xs, ys, wi, re, xd, yd, src_ptr, dst_ptr, src_stride, dst_stride); break; case PictOpFlipOver: bitblit_src32_dst32_fwd_fwd_fover(xs, ys, wi, re, xd, yd, src_ptr, dst_ptr, src_stride, dst_stride); break; default: break; } } // Xrender //#define TROVER(d,m,s) (d) = (m)*(s) + (d)*(0xff ^ (m))) #define TROVERl(d,m,s) (d) = ufma8vlv((s), (m), ufma8vlv((d), (0xffffffff^(m)), 0)) #define TROVERl4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3) \ (d0) = ufma8vlv((d0), (0xffffffff^(m0)), 0); \ (d1) = ufma8vlv((d1), (0xffffffff^(m1)), 0); \ (d2) = ufma8vlv((d2), (0xffffffff^(m2)), 0); \ (d3) = ufma8vlv((d3), (0xffffffff^(m3)), 0); \ (d0) = ufma8vlv((s0), (m0), (d0)); \ (d1) = ufma8vlv((s1), (m1), (d1)); \ (d2) = ufma8vlv((s2), (m2), (d2)); \ (d3) = ufma8vlv((s3), (m3), (d3)) #define TROVERh(d,m,s) (d) = ufma8vhv((s), (m), 
ufma8vhv((d), (0xffffffff^(m)), 0)) #define TROVERh4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3) \ (d0) = ufma8vhv((d0), (0xffffffff^(m0)), 0); \ (d1) = ufma8vhv((d1), (0xffffffff^(m1)), 0); \ (d2) = ufma8vhv((d2), (0xffffffff^(m2)), 0); \ (d3) = ufma8vhv((d3), (0xffffffff^(m3)), 0); \ (d0) = ufma8vhv((s0), (m0), (d0)); \ (d1) = ufma8vhv((s1), (m1), (d1)); \ (d2) = ufma8vhv((s2), (m2), (d2)); \ (d3) = ufma8vhv((s3), (m3), (d3)) /* 3210 0321 // fsr by 8 ; could be rot 1230 // rev8 */ static inline uint32_t pixelswap(const uint32_t p) { /* uint32_t r = __builtin_bswap32(p); */ /* asm("fsr %0, %1, %2, %3\n" : "=r"(r) : "r"(r), "r"(r), "r"(8)); */ uint32_t r; asm("fsr %0, %1, %2, %3\n" : "=r"(r) : "r"(p), "r"(p), "r"(8)); return __builtin_bswap32(r); } #define TRFOVERh(d,m,s) (d) = (ufma8vlv(pixelswap(s), (m), ufma8vlv((d), (0xffffffff^(m)), 0))) #define TRFOVERh4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3) \ (d0) = ufma8vlv((d0), (0xffffffff^(m0)), 0); \ (d1) = ufma8vlv((d1), (0xffffffff^(m1)), 0); \ (d2) = ufma8vlv((d2), (0xffffffff^(m2)), 0); \ (d3) = ufma8vlv((d3), (0xffffffff^(m3)), 0); \ (d0) = (ufma8vlv(pixelswap(s0), (m0), (d0))); \ (d1) = (ufma8vlv(pixelswap(s1), (m1), (d1))); \ (d2) = (ufma8vlv(pixelswap(s2), (m2), (d2))); \ (d3) = (ufma8vlv(pixelswap(s3), (m3), (d3))) #define TROUTREVl(d,m,s) (d) = ufma8vlv((d), (0xffffffff^(m)), 0) #define TROUTREVl4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3) \ (d0) = ufma8vlv((d0), (0xffffffff^(m0)), 0); \ (d1) = ufma8vlv((d1), (0xffffffff^(m1)), 0); \ (d2) = ufma8vlv((d2), (0xffffffff^(m2)), 0); \ (d3) = ufma8vlv((d3), (0xffffffff^(m3)), 0) #define BLITSM8D32_FWD_FWD(NAME, TOP, TOP4) \ static void bitblit_solid_msk8_dst32_fwd_fwd_##NAME(const unsigned_param_type xm, \ const unsigned_param_type ym, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ const unsigned int fgcolor, \ unsigned char* msk_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type 
msk_stride, \ const unsigned_param_type dst_stride) { \ unsigned int i, j; \ unsigned char *mptr = (msk_ptr + (ym * msk_stride) + xm); \ unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); \ unsigned char *mptr_line = mptr; \ unsigned char *dptr_line = dptr; \ for (j = 0 ; j < re ; j++) { \ unsigned char *mptr_elt = mptr_line; \ unsigned int *dptr_elt = (unsigned int*)dptr_line; \ i = 0; \ if (wi > 3) for ( ; i < (wi-3) ; i+= 4) { \ unsigned char m0 = *(mptr_elt+0); \ unsigned char m1 = *(mptr_elt+1); \ unsigned char m2 = *(mptr_elt+2); \ unsigned char m3 = *(mptr_elt+3); \ unsigned int d0 = *(dptr_elt+0); \ unsigned int d1 = *(dptr_elt+1); \ unsigned int d2 = *(dptr_elt+2); \ unsigned int d3 = *(dptr_elt+3); \ TOP4(d0,d1,d2,d3,m0,m1,m2,m3,fgcolor,fgcolor,fgcolor,fgcolor); \ *(dptr_elt+0) = d0; \ *(dptr_elt+1) = d1; \ *(dptr_elt+2) = d2; \ *(dptr_elt+3) = d3; \ dptr_elt += 4; \ mptr_elt += 4; \ } \ for ( ; i < wi ; i++) { \ TOP(*dptr_elt, *mptr_elt, fgcolor); \ dptr_elt ++; \ mptr_elt ++; \ } \ mptr_line += msk_stride; \ dptr_line += dst_stride; \ } \ } BLITSM8D32_FWD_FWD(over, TROVERl, TROVERl4) //BLITSM8D32_FWD_FWD(outreverse, TROUTREVl, TROUTREVl4) #define BLITS32M32D32_FWD_FWD(NAME, TOP, TOP4) \ static void bitblit_src32_msk32_dst32_fwd_fwd_##NAME(const unsigned_param_type xs, \ const unsigned_param_type ys, \ const unsigned_param_type xm, \ const unsigned_param_type ym, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ unsigned char* src_ptr, \ unsigned char* msk_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type src_stride, \ const unsigned_param_type msk_stride, \ const unsigned_param_type dst_stride) { \ unsigned int i, j; \ unsigned char *sptr = (src_ptr + (ys * src_stride) + xs); \ unsigned char *mptr = (msk_ptr + (ym * msk_stride) + xm); \ unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); \ unsigned char *sptr_line = sptr; \ unsigned char *mptr_line = 
mptr; \ unsigned char *dptr_line = dptr; \ for (j = 0 ; j < re ; j++) { \ unsigned int *sptr_elt = (unsigned int*)sptr_line; \ unsigned int *mptr_elt = (unsigned int*)mptr_line; \ unsigned int *dptr_elt = (unsigned int*)dptr_line; \ i = 0; \ if (wi > 3) for ( ; i < (wi-3) ; i+= 4) { \ unsigned int s0 = *(sptr_elt+0); \ unsigned int s1 = *(sptr_elt+1); \ unsigned int s2 = *(sptr_elt+2); \ unsigned int s3 = *(sptr_elt+3); \ unsigned int m0 = *(mptr_elt+0); \ unsigned int m1 = *(mptr_elt+1); \ unsigned int m2 = *(mptr_elt+2); \ unsigned int m3 = *(mptr_elt+3); \ unsigned int d0 = *(dptr_elt+0); \ unsigned int d1 = *(dptr_elt+1); \ unsigned int d2 = *(dptr_elt+2); \ unsigned int d3 = *(dptr_elt+3); \ TOP4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3); \ *(dptr_elt+0) = d0; \ *(dptr_elt+1) = d1; \ *(dptr_elt+2) = d2; \ *(dptr_elt+3) = d3; \ sptr_elt += 4; \ dptr_elt += 4; \ mptr_elt += 4; \ } \ for ( ; i < wi ; i++) { \ TOP(*dptr_elt, *mptr_elt, *sptr_elt); \ sptr_elt ++; \ dptr_elt ++; \ mptr_elt ++; \ } \ sptr_line += src_stride; \ mptr_line += msk_stride; \ dptr_line += dst_stride; \ } \ } BLITS32M32D32_FWD_FWD(over, TROVERh, TROVERh4) BLITS32M32D32_FWD_FWD(fover, TRFOVERh, TRFOVERh4) #define BLITS32D32_FWD_FWD(NAME, TOP, TOP4) \ static void bitblit_src32_dst32_fwd_fwd_##NAME(const unsigned_param_type xs, \ const unsigned_param_type ys, \ const unsigned_param_type wi, \ const unsigned_param_type re, \ const unsigned_param_type xd, \ const unsigned_param_type yd, \ unsigned char* src_ptr, \ unsigned char* dst_ptr, \ const unsigned_param_type src_stride, \ const unsigned_param_type dst_stride) { \ unsigned int i, j; \ unsigned char *sptr = (src_ptr + (ys * src_stride) + xs); \ unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd); \ unsigned char *sptr_line = sptr; \ unsigned char *dptr_line = dptr; \ for (j = 0 ; j < re ; j++) { \ unsigned int *sptr_elt = (unsigned int*)sptr_line; \ unsigned int *dptr_elt = (unsigned int*)dptr_line; \ i = 0; \ if (wi > 3) for ( ; i < (wi-3) 
; i+= 4) { \ unsigned int s0 = *(sptr_elt+0); \ unsigned int s1 = *(sptr_elt+1); \ unsigned int s2 = *(sptr_elt+2); \ unsigned int s3 = *(sptr_elt+3); \ unsigned int d0 = *(dptr_elt+0); \ unsigned int d1 = *(dptr_elt+1); \ unsigned int d2 = *(dptr_elt+2); \ unsigned int d3 = *(dptr_elt+3); \ TOP4(d0,d1,d2,d3,s0,s1,s2,s3,s0,s1,s2,s3); \ *(dptr_elt+0) = d0; \ *(dptr_elt+1) = d1; \ *(dptr_elt+2) = d2; \ *(dptr_elt+3) = d3; \ sptr_elt += 4; \ dptr_elt += 4; \ } \ for ( ; i < wi ; i++) { \ TOP(*dptr_elt, *sptr_elt, *sptr_elt); \ sptr_elt ++; \ dptr_elt ++; \ } \ sptr_line += src_stride; \ dptr_line += dst_stride; \ } \ } BLITS32D32_FWD_FWD(over, TROVERh, TROVERh4) BLITS32D32_FWD_FWD(fover, TRFOVERh, TRFOVERh4)