Stuff for exa/composite support in X11 (SBusFPGA only for now)

2022-08-20 18:54:30 +02:00 · 2022-08-20 18:54:30 +02:00 · 6e8b0192e2
parent d9f964dd47
commit 6e8b0192e2
5 changed files with 8289 additions and 1011 deletions
--- a/nubus-to-ztex-gateware/VexRiscv_GoblinAccel_NuBus.v
+++ b/nubus-to-ztex-gateware/VexRiscv_GoblinAccel_NuBus.v
--- a/nubus-to-ztex-gateware/VexRiscv_GoblinAccel_SBus.v
+++ b/nubus-to-ztex-gateware/VexRiscv_GoblinAccel_SBus.v
--- a/nubus-to-ztex-gateware/blit_goblin.c
+++ b/nubus-to-ztex-gateware/blit_goblin.c
@ -31,25 +31,70 @@
 typedef unsigned int uint32_t;
 typedef volatile unsigned int u_int32_t;

-/*
-struct control_blitter {
-	volatile unsigned int fun;
-	volatile unsigned int done;
-	volatile unsigned short arg[8];
-};
-*/
+// X11 graphics functions
+#define	GXclear			0x0		/* 0 */
+#define GXand			0x1		/* src AND dst */
+#define GXandReverse	0x2		/* src AND NOT dst */
+#define GXcopy			0x3		/* src */
+#define GXandInverted	0x4		/* NOT src AND dst */
+#define	GXnoop			0x5		/* dst */
+#define GXxor			0x6		/* src XOR dst */
+#define GXor			0x7		/* src OR dst */
+#define GXnor			0x8		/* NOT src AND NOT dst */
+#define GXequiv			0x9		/* NOT src XOR dst */
+#define GXinvert		0xa		/* NOT dst */
+#define GXorReverse		0xb		/* src OR NOT dst */
+#define GXcopyInverted	0xc		/* NOT src */
+#define GXorInverted	0xd		/* NOT src OR dst */
+#define GXnand			0xe		/* NOT src OR NOT dst */
+#define GXset			0xf		/* 1 */

-#define FUN_BLIT_BIT            0 // hardwired in goblin_accel.py
-#define FUN_FILL_BIT            1 // hardwired in goblin_accel.py
-#define FUN_PATT_BIT            2 // hardwired in goblin_accel.py
-#define FUN_TEST_BIT            3 // hardwired in goblin_accel.py
-#define FUN_DONE_BIT           31
+// Xrender op
+#define PictOpClear           (0x80 | 0x0)
+#define PictOpSrc             (0x80 | 0x1)
+#define PictOpDst             (0x80 | 0x2)
+#define PictOpOver            (0x80 | 0x3)
+#define PictOpOverReverse     (0x80 | 0x4)
+#define PictOpIn              (0x80 | 0x5)
+#define PictOpInReverse       (0x80 | 0x6)
+#define PictOpOut             (0x80 | 0x7)
+#define PictOpOutReverse      (0x80 | 0x8)
+#define PictOpAtop            (0x80 | 0x9)
+#define PictOpAtopReverse     (0x80 | 0xa)
+#define PictOpXor             (0x80 | 0xb)
+#define PictOpAdd             (0x80 | 0xc)
+#define PictOpSaturate        (0x80 | 0xd)
+// custom, with 0x40 for 'flip src'
+#define PictOpFlipClear           (0x80 | 0x40 | 0x0)
+#define PictOpFlipSrc             (0x80 | 0x40 | 0x1)
+#define PictOpFlipDst             (0x80 | 0x40 | 0x2)
+#define PictOpFlipOver            (0x80 | 0x40 | 0x3)
+#define PictOpFlipOverReverse     (0x80 | 0x40 | 0x4)
+#define PictOpFlipIn              (0x80 | 0x40 | 0x5)
+#define PictOpFlipInReverse       (0x80 | 0x40 | 0x6)
+#define PictOpFlipOut             (0x80 | 0x40 | 0x7)
+#define PictOpFlipOutReverse      (0x80 | 0x40 | 0x8)
+#define PictOpFlipAtop            (0x80 | 0x40 | 0x9)
+#define PictOpFlipAtopReverse     (0x80 | 0x40 | 0xa)
+#define PictOpFlipXor             (0x80 | 0x40 | 0xb)
+#define PictOpFlipAdd             (0x80 | 0x40 | 0xc)
+#define PictOpFlipSaturate        (0x80 | 0x40 | 0xd)

-#define FUN_BLIT           (1<<FUN_BLIT_BIT)
-#define FUN_FILL           (1<<FUN_FILL_BIT)
-#define FUN_PATT           (1<<FUN_PATT_BIT)
-#define FUN_TEST           (1<<FUN_TEST_BIT)
-#define FUN_DONE           (1<<FUN_DONE_BIT)
+#define FUN_BLIT_BIT             0 // hardwired in goblin_accel.py
+#define FUN_FILL_BIT             1 // hardwired in goblin_accel.py
+#define FUN_PATT_BIT             2 // hardwired in goblin_accel.py
+#define FUN_RSMSK8DST32_BIT      3 // hardwired in goblin_accel.py
+#define FUN_RSRC32MSK32DST32_BIT 4 // hardwired in goblin_accel.py
+#define FUN_RSRC32DST32_BIT      5 // hardwired in goblin_accel.py
+#define FUN_DONE_BIT             31
+
+#define FUN_BLIT             (1<<FUN_BLIT_BIT) // command masks, one per FUN_*_BIT position
+#define FUN_FILL             (1<<FUN_FILL_BIT)
+#define FUN_PATT             (1<<FUN_PATT_BIT)
+#define FUN_RSMSK8DST32      (1<<FUN_RSMSK8DST32_BIT)
+#define FUN_RSRC32MSK32DST32 (1<<FUN_RSRC32MSK32DST32_BIT)
+#define FUN_RSRC32DST32      (1<<FUN_RSRC32DST32_BIT)
+#define FUN_DONE             (1u<<FUN_DONE_BIT) // unsigned literal: shifting a signed 1 into bit 31 overflows int

 struct goblin_bt_regs {
 	u_int32_t mode;
@ -75,11 +120,11 @@ struct goblin_accel_regs {
 	u_int32_t reg_status; // 0
 	u_int32_t reg_cmd;
 	u_int32_t reg_r5_cmd;
-	u_int32_t resv0;
+	u_int32_t reg_op; // 3; X11 op or (0x80 | Render op)
 	u_int32_t reg_width; // 4
 	u_int32_t reg_height;
 	u_int32_t reg_fgcolor;
-	u_int32_t resv2;
+	u_int32_t reg_depth; // 7; 0 is native
 	u_int32_t reg_bitblt_src_x; // 8
 	u_int32_t reg_bitblt_src_y;
 	u_int32_t reg_bitblt_dst_x;
@ -88,6 +133,11 @@ struct goblin_accel_regs {
 	u_int32_t reg_dst_stride; // 13
 	u_int32_t reg_src_ptr; // 14
 	u_int32_t reg_dst_ptr; // 15
+	
+	u_int32_t reg_bitblt_msk_x; // 16
+	u_int32_t reg_bitblt_msk_y;
+	u_int32_t reg_msk_stride; // 18
+	u_int32_t reg_msk_ptr; // 19
 };

 //#include "./rvintrin.h"
@ -169,10 +219,48 @@ static void patternrectfill(const unsigned_param_type xd,
 							const unsigned_param_type dst_stride
 							);

-static void print_hexword(unsigned int v, unsigned int bx, unsigned int by);
-static void show_status_on_screen(void);
+static void bitblit_solid_msk8_dst32_fwd_fwd(const unsigned char op,
+											  const unsigned_param_type xm,
+											  const unsigned_param_type ym,
+											  const unsigned_param_type wi,
+											  const unsigned_param_type re,
+											  const unsigned_param_type xd,
+											  const unsigned_param_type yd,
+											  const unsigned int fgcolor,
+											  unsigned char* msk_ptr,
+											  unsigned char* dst_ptr,
+											  const unsigned_param_type msk_stride,
+											  const unsigned_param_type dst_stride);

-asm(".global _start\n"
+static void bitblit_src32_msk32_dst32_fwd_fwd(const unsigned char op,
+											   const unsigned_param_type xs,
+											   const unsigned_param_type ys,
+											   const unsigned_param_type xm,
+											   const unsigned_param_type ym,
+											   const unsigned_param_type wi,
+											   const unsigned_param_type re,
+											   const unsigned_param_type xd,
+											   const unsigned_param_type yd,
+											   unsigned char* src_ptr,
+											   unsigned char* msk_ptr,
+											   unsigned char* dst_ptr,
+											   const unsigned_param_type src_stride,
+											   const unsigned_param_type msk_stride,
+											   const unsigned_param_type dst_stride);
+
+static void bitblit_src32_dst32_fwd_fwd(const unsigned char op,
+										const unsigned_param_type xs,
+										const unsigned_param_type ys,
+										const unsigned_param_type wi,
+										const unsigned_param_type re,
+										const unsigned_param_type xd,
+										const unsigned_param_type yd,
+										unsigned char* src_ptr,
+										unsigned char* dst_ptr,
+										const unsigned_param_type src_stride,
+										const unsigned_param_type dst_stride);
+
+	asm(".global _start\n"
 	"_start:\n"
 	// ".word 0x0000500F\n" // flush cache ; should not be needed after reset
 	//"addi sp,zero,66\n" // 0x0042
@ -220,54 +308,80 @@ void from_reset(void) {
 	struct goblin_accel_regs* fbc = (struct goblin_accel_regs*)BASE_ACCEL_REGS;
 	struct goblin_bt_regs* fbt = (struct goblin_bt_regs*)BASE_BT_REGS;
 	unsigned int cmd = fbc->reg_r5_cmd;
+	unsigned char depth = fbc->reg_depth;
+	unsigned char op = fbc->reg_op;
 	uint32_t srcx, wi, dstx;
+	if (depth == 0) {
 #if defined(GOBLIN_NUBUS)
-	switch ((fbt->mode>>24) & 0xFF)  // mode is 8 bits wrong-endian (all fbt is wrong-endian in NuBus version)
+		switch ((fbt->mode>>24) & 0xFF)  // mode is 8 bits wrong-endian (all fbt is wrong-endian in NuBus version)
 #elif defined(GOBLIN_SBUS)
-	switch (fbt->mode & 0xFF)
+		switch (fbt->mode & 0xFF)
 #else
 #error "Must define GOBLIN_NUBUS or GOBLIN_SBUS"
 #endif
-		{
+	{
 	case mode_32bit:
+		depth = 32;
+		break;
+	case mode_16bit:
+		depth = 16;
+		break;
+	default:
+	case mode_8bit:
+		depth = 8;
+		break;
+	case mode_4bit:
+		depth = 4;
+		break;
+	case mode_2bit:
+		depth = 2;
+		break;
+	case mode_1bit:
+		depth = 1;
+		break;
+	}
+	}
+	switch (depth)
+		{
+	case 32:
 		srcx = fbc->reg_bitblt_src_x << 2;
 		wi   = fbc->reg_width        << 2;
 		dstx = fbc->reg_bitblt_dst_x << 2;
 		break;
-	case mode_16bit:
+	case 16:
 		srcx = fbc->reg_bitblt_src_x << 1;
 		wi   = fbc->reg_width        << 1;
 		dstx = fbc->reg_bitblt_dst_x << 1;
 		break;
 	default:
-	case mode_8bit:
+	case 8:
 		srcx = fbc->reg_bitblt_src_x;
 		wi   = fbc->reg_width;
 		dstx = fbc->reg_bitblt_dst_x;
 		break;
-	case mode_4bit:
+	case 4:
 		srcx = fbc->reg_bitblt_src_x >> 1;
 		wi   = fbc->reg_width        >> 1;
 		dstx = fbc->reg_bitblt_dst_x >> 1;
 		break;
-	case mode_2bit:
+	case 2:
 		srcx = fbc->reg_bitblt_src_x >> 2;
 		wi   = fbc->reg_width        >> 2;
 		dstx = fbc->reg_bitblt_dst_x >> 2;
 		break;
-	case mode_1bit:
+	case 1:
 		srcx = fbc->reg_bitblt_src_x >> 3;
 		wi   = fbc->reg_width        >> 3;
 		dstx = fbc->reg_bitblt_dst_x >> 3;
 		break;
 	}

-	switch (cmd & 0xF) {
+	switch (cmd & 0xFF) {
 	case FUN_BLIT: {
 		bitblit(srcx, fbc->reg_bitblt_src_y,
 				wi  , fbc->reg_height,
 				dstx, fbc->reg_bitblt_dst_y,
-				0xFF, 0x3, // GXcopy
+				0xFF, op, // FIXME: re-add planemask support for X11 ops
 				fbc->reg_src_ptr ? (unsigned char*)fbc->reg_src_ptr : (unsigned char*)BASE_FB,
 				fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB,
 				fbc->reg_src_stride,
@ -290,6 +404,50 @@ void from_reset(void) {
 						fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB,
 						fbc->reg_dst_stride); // assumed to be scaled already
 	} break;
+	case FUN_RSMSK8DST32: {
+		bitblit_solid_msk8_dst32_fwd_fwd(op,
+										  fbc->reg_bitblt_msk_x, // unscaled
+										  fbc->reg_bitblt_msk_y,
+										  fbc->reg_width, // NOT scaled here, we assume depth == 32 here
+										  fbc->reg_height,
+										  dstx, // still scaled for the PTR calculation ...
+										  fbc->reg_bitblt_dst_y,
+										  fbc->reg_fgcolor,
+										  fbc->reg_msk_ptr ? (unsigned char*)fbc->reg_msk_ptr : (unsigned char*)BASE_FB,
+										  fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB,
+										  fbc->reg_msk_stride, // assumed to be scaled already
+										  fbc->reg_dst_stride); // assumed to be scaled already
+	} break;
+	case FUN_RSRC32MSK32DST32: {
+		bitblit_src32_msk32_dst32_fwd_fwd(op,
+										   fbc->reg_bitblt_src_x, // unscaled
+										   fbc->reg_bitblt_src_y,
+										   fbc->reg_bitblt_msk_x, // unscaled
+										   fbc->reg_bitblt_msk_y,
+										   fbc->reg_width, // NOT scaled here, we assume depth == 32 here
+										   fbc->reg_height,
+										   dstx, // still scaled for the PTR calculation ...
+										   fbc->reg_bitblt_dst_y,
+										   fbc->reg_src_ptr ? (unsigned char*)fbc->reg_src_ptr : (unsigned char*)BASE_FB,
+										   fbc->reg_msk_ptr ? (unsigned char*)fbc->reg_msk_ptr : (unsigned char*)BASE_FB,
+										   fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB,
+										   fbc->reg_src_stride, // assumed to be scaled already
+										   fbc->reg_msk_stride, // assumed to be scaled already
+										   fbc->reg_dst_stride); // assumed to be scaled already
+	} break;
+	case FUN_RSRC32DST32: {
+		bitblit_src32_dst32_fwd_fwd(op,
+									fbc->reg_bitblt_src_x, // unscaled
+									fbc->reg_bitblt_src_y,
+									fbc->reg_width, // NOT scaled here, we assume depth == 32 here
+									fbc->reg_height,
+									dstx, // still scaled for the PTR calculation ...
+									fbc->reg_bitblt_dst_y,
+									fbc->reg_src_ptr ? (unsigned char*)fbc->reg_src_ptr : (unsigned char*)BASE_FB,
+									fbc->reg_dst_ptr ? (unsigned char*)fbc->reg_dst_ptr : (unsigned char*)BASE_FB,
+									fbc->reg_src_stride, // assumed to be scaled already
+									fbc->reg_dst_stride); // assumed to be scaled already
+	} break;
 	default:
 		break;
 	}
@ -330,6 +488,8 @@ bitblit_proto(_xor);
 bitblit_proto(_copy_pm);
 bitblit_proto(_xor_pm);

+bitblit_proto(_radd);
+

 #define ROUTE_BITBLIT_PM(pm, bb)							\
 	if (pm == 0xFF) bb(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride); \
@ -352,47 +512,59 @@ static void bitblit(const unsigned_param_type xs,
 	
 	if (ys > yd) {
 		switch(gxop) {
-		case 0x3: // GXcopy
+		case GXcopy:
 			ROUTE_BITBLIT_PM(pm, bitblit_fwd_fwd_copy);
 			break;
-		case 0x6: // GXxor
+		case GXxor:
 			ROUTE_BITBLIT_PM(pm, bitblit_fwd_fwd_xor);
 			break;
+		case PictOpAdd:
+			bitblit_fwd_fwd_radd(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride);
+			break;
 		}
 	} else if (ys < yd) {
 		switch(gxop) {
-		case 0x3: // GXcopy
+		case GXcopy:
 			ROUTE_BITBLIT_PM(pm, bitblit_bwd_fwd_copy);
 			break;
-		case 0x6: // GXxor
+		case GXxor:
 			ROUTE_BITBLIT_PM(pm, bitblit_bwd_fwd_xor);
 			break;
+		case PictOpAdd:
+			bitblit_bwd_fwd_radd(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride);
+			break;
 		}
 	} else { // ys == yd
 		if (xs > xd) {
 			switch(gxop) {
-			case 0x3: // GXcopy
+			case GXcopy:
 				ROUTE_BITBLIT_PM(pm, bitblit_fwd_fwd_copy);
 				break;
-			case 0x6: // GXxor
+			case GXxor:
 				ROUTE_BITBLIT_PM(pm, bitblit_fwd_fwd_xor);
 				break;
+			case PictOpAdd:
+				bitblit_fwd_fwd_radd(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride);
+				break;
 			}
 		} else if (xs < xd) {
 			switch(gxop) {
-			case 0x3: // GXcopy
+			case GXcopy:
 				ROUTE_BITBLIT_PM(pm, bitblit_fwd_bwd_copy);
 				break;
-			case 0x6: // GXxor
+			case GXxor:
 				ROUTE_BITBLIT_PM(pm, bitblit_fwd_bwd_xor);
 				break;
+			case PictOpAdd:
+				bitblit_fwd_bwd_radd(xs, ys, wi, re, xd, yd, pm, src_ptr, dst_ptr, src_stride, dst_stride);
+				break;
 			}
 		} else { // xs == xd
 			switch(gxop) {
-			case 0x3: // GXcopy
+			case GXcopy:
 				/* don't bother */
 				break;
-			case 0x6:  // GXxor
+			case GXxor:
 				rectfill_pm(xd, yd, wi, re, 0, pm, dst_ptr, dst_stride);
 				break;
 			}
@ -423,20 +595,19 @@ static void rectfill(const unsigned_param_type xd,
 			dptr_elt ++;
 		}
 		if (wi > 3) {
-			unsigned int u32color = (unsigned int)u8color | ((unsigned int)u8color)<<8 | ((unsigned int)u8color)<<16 | ((unsigned int)u8color)<<24;
 			if ((wi>15) && (((unsigned int)dptr_elt&0x7)==0)) {
 				register unsigned int s8 asm("s8");
 				register unsigned int s9 asm("s9");
-				s8 = u32color;
-				s9 = u32color;
+				s8 = color;
+				s9 = color;
 				for ( ; i < (wi-15) ; i+=16) {
-					sd(dptr_elt, 0, 0, s8, s9);
-					sd(dptr_elt, 8, 0, s8, s9);
+					_custom_sd(dptr_elt, 0, 0, s8, s9);
+					_custom_sd(dptr_elt, 8, 0, s8, s9);
 					dptr_elt += 16;
 				}
 			}
 			for ( ; i < (wi-3) ; i+=4) {
-				*(unsigned int*)dptr_elt = u32color;
+				*(unsigned int*)dptr_elt = color;
 				dptr_elt +=4;
 			}
 		}	
@ -471,10 +642,9 @@ static void rectfill_pm(const unsigned_param_type xd,
 			dptr_elt ++;
 		}
 		if (wi > 3) {
-			unsigned int u32color = (unsigned int)u8color | ((unsigned int)u8color)<<8 | ((unsigned int)u8color)<<16 | ((unsigned int)u8color)<<24;
 			unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24;
 			for ( ; i < (wi-3) ; i+=4) {
-				*(unsigned int*)dptr_elt = (u32color & u32pm) | (*(unsigned int*)dptr_elt & ~u32pm);
+				*(unsigned int*)dptr_elt = (color & u32pm) | (*(unsigned int*)dptr_elt & ~u32pm);
 				dptr_elt +=4;
 			}
 		}
@ -509,9 +679,8 @@ static void xorrectfill(const unsigned_param_type xd,
 			dptr_elt ++;
 		}
 		if (wi > 3) {
-			unsigned int u32color = (unsigned int)u8color | ((unsigned int)u8color)<<8 | ((unsigned int)u8color)<<16 | ((unsigned int)u8color)<<24;
 			for ( ; i < (wi-3) ; i+=4) {
-				*(unsigned int*)dptr_elt ^= u32color;
+				*(unsigned int*)dptr_elt ^= color;
 				dptr_elt +=4;
 			}
 		}	
@ -545,10 +714,9 @@ static void xorrectfill_pm(const unsigned_param_type xd,
 			dptr_elt ++;
 		}
 		if (wi > 3) {
-			unsigned int u32color = (unsigned int)u8color | ((unsigned int)u8color)<<8 | ((unsigned int)u8color)<<16 | ((unsigned int)u8color)<<24;
 			unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24;
 			for ( ; i < (wi-3) ; i+=4) {
-				*(unsigned int*)dptr_elt ^= (u32color & u32pm);
+				*(unsigned int*)dptr_elt ^= (color & u32pm);
 				dptr_elt +=4;
 			}
 		}
@ -593,7 +761,7 @@ static void invert(const unsigned_param_type xd,
 	}
 }

-
+// X11
 // NOT using npm enables the use of 'cmix' in more cases
 #define COPY(d,s,pm,npm) (d) = (s)
 //#define COPY_PM(d,s,pm,npm) (d) = (((s) & (pm)) | ((d) & (npm)))
@ -601,6 +769,8 @@ static void invert(const unsigned_param_type xd,
 #define XOR(d,s,pm,npm) (d) = ((s) ^ (d))
 //#define XOR_PM(d,s,pm,npm) (d) = ((((s) ^ (d)) & (pm)) | ((d) & (npm)))
 #define XOR_PM(d,s,pm,npm) (d) = ((((s) ^ (d)) & (pm)) | ((d) & (~pm)))
+// Xrender
+#define RADD(d,s,pm,npm) (d) = ukadd8((d), (s))

 #define BLIT_FWD_FWD(NAME, OP)											\
 	static void bitblit_fwd_fwd_##NAME(const unsigned_param_type xs,	\
@ -792,7 +962,7 @@ static void invert(const unsigned_param_type xd,
 	BLIT_FWD_BWD(NAME, OP)					\
 	BLIT_BWD_FWD(NAME, OP)					\
 		
-#define BLIT_NOTALLDIR(NAME, OP)				\
+#define BLIT_NOTALLDIR(NAME, OP)			\
 	BLIT_FWD_BWD(NAME, OP)					\
 	BLIT_BWD_FWD(NAME, OP)					\
 	
@ -802,6 +972,7 @@ BLIT_ALLDIR(xor, XOR)
 BLIT_ALLDIR(copy_pm, COPY_PM)
 BLIT_ALLDIR(xor_pm, XOR_PM)
 	
+BLIT_ALLDIR(radd, RADD)
 	
 static void bitblit_fwd_fwd_copy(const unsigned_param_type xs,
 								 const unsigned_param_type ys,
@ -849,7 +1020,7 @@ static void bitblit_fwd_fwd_copy(const unsigned_param_type xs,
 				}
 				sptr_elt = sptr_elt_al + ((unsigned int)sptr_elt & 0x3);
 			} else if ((xs & 0x7) != (xd & 0x7)) {
-				/* off-hy-4, can't use 64 ld/sd but still can use 32-bits */
+				/* off-by-4, can't use 64 ld/sd directly (could pipeline the 32-bits data) but still can use 32-bits */
 				const unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24;
 				const unsigned char* dptr_elt_end = dptr_elt + wi;
 				/* align dest & src (they are aligned the same here up to 0x3) */
@ -888,17 +1059,17 @@ static void bitblit_fwd_fwd_copy(const unsigned_param_type xs,
 					register unsigned int s9 asm("s9");
 					register unsigned int s10 asm("s10");
 					register unsigned int s11 asm("s11");
-					ld(sptr_elt, 0, s4, s5);
-					ld(sptr_elt, 16, s8, s9);
+					_custom_ld(sptr_elt, 0, s4, s5);
+					_custom_ld(sptr_elt, 16, s8, s9);
 					
-					ld(sptr_elt, 8, s6, s7);
-					sd(dptr_elt, 0, 0, s4, s5);
-					sd(dptr_elt, 8, 0, s6, s7);
+					_custom_ld(sptr_elt, 8, s6, s7);
+					_custom_sd(dptr_elt, 0, 0, s4, s5);
+					_custom_sd(dptr_elt, 8, 0, s6, s7);
 					
-					ld(sptr_elt, 24, s10, s11);
-					sd(dptr_elt, 16, 0, s8, s9);
+					_custom_ld(sptr_elt, 24, s10, s11);
+					_custom_sd(dptr_elt, 16, 0, s8, s9);
 					sptr_elt += 32;
-					sd(dptr_elt, 24, 0, s10, s11);
+					_custom_sd(dptr_elt, 24, 0, s10, s11);
 					dptr_elt += 32;
 					
 				}
@ -908,19 +1079,19 @@ static void bitblit_fwd_fwd_copy(const unsigned_param_type xs,
 					register unsigned int s9 asm("s9");
 					register unsigned int s10 asm("s10");
 					register unsigned int s11 asm("s11");
-					ld(sptr_elt, 0, s8, s9);
-					ld(sptr_elt, 8, s10, s11);
-					sd(dptr_elt, 0, 0, s8, s9);
+					_custom_ld(sptr_elt, 0, s8, s9);
+					_custom_ld(sptr_elt, 8, s10, s11);
+					_custom_sd(dptr_elt, 0, 0, s8, s9);
 					sptr_elt += 16;
-					sd(dptr_elt, 8, 0, s10, s11);
+					_custom_sd(dptr_elt, 8, 0, s10, s11);
 					dptr_elt += 16;
 				}
 #if 0
 				for ( ; (dptr_elt < (dptr_elt_last-7)) ; ) {
 					register unsigned int s8 asm("s8");
 					register unsigned int s9 asm("s9");
-					ld(sptr_elt, 0, s8, s9);
-					sd(dptr_elt, 0, 0, s8, s9);
+					_custom_ld(sptr_elt, 0, s8, s9);
+					_custom_sd(dptr_elt, 0, 0, s8, s9);
 					sptr_elt += 8;
 					dptr_elt += 8;
 				}
@ -993,3 +1164,340 @@ static void patternrectfill(const unsigned_param_type xd,
 		pat_ptr_line = pat_ptr + ((j+jo) & pat_ymask) * pat_stride;
 	}
 }
+
+#define bitblit_render_proto(a, b, suf) /* forward-declares the three Render blitters whose names end in a, b, suf pasted together */ \
+	static void bitblit_solid_msk8_dst32##a##b##suf(const unsigned_param_type xm, /* variant 1: fgcolor + mask + dst */ \
+													const unsigned_param_type ym, \
+													const unsigned_param_type wi, \
+													const unsigned_param_type re, \
+													const unsigned_param_type xd, \
+													const unsigned_param_type yd, \
+													const unsigned int fgcolor, \
+													unsigned char* msk_ptr, \
+													unsigned char* dst_ptr, \
+													const unsigned_param_type msk_stride, \
+													const unsigned_param_type dst_stride); \
+	static void bitblit_src32_msk32_dst32##a##b##suf(const unsigned_param_type xs, /* variant 2: src + mask + dst */ \
+													 const unsigned_param_type ys, \
+													 const unsigned_param_type xm, \
+													 const unsigned_param_type ym, \
+													 const unsigned_param_type wi, \
+													 const unsigned_param_type re, \
+													 const unsigned_param_type xd, \
+													 const unsigned_param_type yd, \
+													 unsigned char* src_ptr, \
+													 unsigned char* msk_ptr, \
+													 unsigned char* dst_ptr, \
+													 const unsigned_param_type src_stride, \
+													 const unsigned_param_type msk_stride, \
+													 const unsigned_param_type dst_stride);	\
+	static void bitblit_src32_dst32##a##b##suf(const unsigned_param_type xs, /* variant 3: src + dst, no mask */ \
+											   const unsigned_param_type ys, \
+											   const unsigned_param_type wi, \
+											   const unsigned_param_type re, \
+											   const unsigned_param_type xd, \
+											   const unsigned_param_type yd, \
+											   unsigned char* src_ptr,	\
+											   unsigned char* dst_ptr,	\
+											   const unsigned_param_type src_stride, \
+											   const unsigned_param_type dst_stride);
+
+bitblit_render_proto(_fwd, _fwd, _over)
+bitblit_render_proto(_fwd, _fwd, _fover)
+bitblit_render_proto(_fwd, _fwd, _outreverse)
+
+
+static void bitblit_solid_msk8_dst32_fwd_fwd(const unsigned char op, // dispatcher: picks the solid-color/mask/dst blitter matching op
+											  const unsigned_param_type xm,
+											  const unsigned_param_type ym,
+											  const unsigned_param_type wi,
+											  const unsigned_param_type re,
+											  const unsigned_param_type xd,
+											  const unsigned_param_type yd,
+											  const unsigned int fgcolor,
+											  unsigned char* msk_ptr,
+											  unsigned char* dst_ptr,
+											  const unsigned_param_type msk_stride,
+											  const unsigned_param_type dst_stride) {
+	switch (op) {
+	case PictOpOver: // the only op currently routed; all arguments are forwarded unchanged
+		bitblit_solid_msk8_dst32_fwd_fwd_over(xm, ym, wi, re, xd, yd, fgcolor, msk_ptr, dst_ptr, msk_stride, dst_stride);
+		break;
+	/* case PictOpOutReverse: */
+	/* 	bitblit_solid_msk8_dst32_fwd_fwd_outreverse(xm, ym, wi, re, xd, yd, fgcolor, msk_ptr, dst_ptr, msk_stride, dst_stride); */
+	/* 	break; */
+	default: // any other op value does nothing
+		break;
+	}
+}
+static void bitblit_src32_msk32_dst32_fwd_fwd(const unsigned char op, // dispatcher: picks the src/mask/dst blitter matching op
+											  const unsigned_param_type xs,
+											  const unsigned_param_type ys,
+											  const unsigned_param_type xm,
+											  const unsigned_param_type ym,
+											  const unsigned_param_type wi,
+											  const unsigned_param_type re,
+											  const unsigned_param_type xd,
+											  const unsigned_param_type yd,
+											  unsigned char* src_ptr,
+											  unsigned char* msk_ptr,
+											  unsigned char* dst_ptr,
+											  const unsigned_param_type src_stride,
+											  const unsigned_param_type msk_stride,
+											  const unsigned_param_type dst_stride)
+{
+	switch (op) {
+	case PictOpOver: // the only op currently routed; all arguments are forwarded unchanged
+		bitblit_src32_msk32_dst32_fwd_fwd_over(xs, ys, xm, ym, wi, re, xd, yd, src_ptr, msk_ptr, dst_ptr, src_stride, msk_stride, dst_stride);
+		break;
+	default: // any other op value does nothing
+		break;
+	}
+}
+static void bitblit_src32_dst32_fwd_fwd(const unsigned char op, // dispatcher: picks the maskless src/dst blitter matching op
+										const unsigned_param_type xs,
+										const unsigned_param_type ys,
+										const unsigned_param_type wi,
+										const unsigned_param_type re,
+										const unsigned_param_type xd,
+										const unsigned_param_type yd,
+										unsigned char* src_ptr,
+										unsigned char* dst_ptr,
+										const unsigned_param_type src_stride,
+										const unsigned_param_type dst_stride)
+{
+	switch (op) {
+	case PictOpOver: // plain Over
+		bitblit_src32_dst32_fwd_fwd_over(xs, ys, wi, re, xd, yd, src_ptr, dst_ptr, src_stride, dst_stride);
+		break;
+	case PictOpFlipOver: // custom op (Over with the 0x40 'flip src' bit), routed to the fover variant
+		bitblit_src32_dst32_fwd_fwd_fover(xs, ys, wi, re, xd, yd, src_ptr, dst_ptr, src_stride, dst_stride);
+		break;
+	default: // any other op value does nothing
+		break;
+	}
+}
+
+// Xrender
+//#define TROVER(d,m,s) (d) = ((m)*(s) + (d)*(0xff ^ (m)))
+#define TROVERl(d,m,s) (d) = ufma8vlv((s), (m), ufma8vlv((d), (0xffffffff^(m)), 0)) // inner call combines d with ~m, outer call adds s combined with m; ufma8vlv is defined outside this view — presumably a per-byte multiply-add, confirm
+#define TROVERl4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3) /* TROVERl on four pixels: the four inner steps are issued first, then the four outer steps */ \
+	(d0) = ufma8vlv((d0), (0xffffffff^(m0)), 0);		\
+	(d1) = ufma8vlv((d1), (0xffffffff^(m1)), 0);		\
+	(d2) = ufma8vlv((d2), (0xffffffff^(m2)), 0);		\
+	(d3) = ufma8vlv((d3), (0xffffffff^(m3)), 0);		\
+	(d0) = ufma8vlv((s0), (m0), (d0));					\
+	(d1) = ufma8vlv((s1), (m1), (d1));					\
+	(d2) = ufma8vlv((s2), (m2), (d2));					\
+	(d3) = ufma8vlv((s3), (m3), (d3))
+
+#define TROVERh(d,m,s) (d) = ufma8vhv((s), (m), ufma8vhv((d), (0xffffffff^(m)), 0)) // same shape as TROVERl but built on ufma8vhv (defined outside this view; the l/h difference is not visible here — confirm)
+#define TROVERh4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3) /* TROVERh on four pixels: four inner steps first, then four outer steps */ \
+	(d0) = ufma8vhv((d0), (0xffffffff^(m0)), 0);		\
+	(d1) = ufma8vhv((d1), (0xffffffff^(m1)), 0);		\
+	(d2) = ufma8vhv((d2), (0xffffffff^(m2)), 0);		\
+	(d3) = ufma8vhv((d3), (0xffffffff^(m3)), 0);		\
+	(d0) = ufma8vhv((s0), (m0), (d0));					\
+	(d1) = ufma8vhv((s1), (m1), (d1));					\
+	(d2) = ufma8vhv((s2), (m2), (d2));					\
+	(d3) = ufma8vhv((s3), (m3), (d3))
+
+/* 
+   3210
+   0321 // fsr by 8 ; could be rot
+*/
+
+static inline uint32_t pixelswap(const uint32_t p) { // returns p with its bytes reordered by a single 'fsr' instruction
+	/* uint32_t r = __builtin_bswap32(p); */
+	/* asm("fsr %0, %1, %2, %3\n" : "=r"(r) : "r"(r), "r"(r), "r"(8)); */
+	uint32_t r;
+	asm("fsr %0, %1, %2, %3\n" : "=r"(r) : "r"(p), "r"(p), "r"(8)); // both data operands are p, shift amount 8; per the note above, bytes 3210 become 0321
+	return r;
+}
+
+#define TRFOVERh(d,m,s) (d) = (ufma8vlv(pixelswap(s), (m), ufma8vlv((d), (0xffffffff^(m)), 0))) // Over with the source run through pixelswap() first; NOTE(review): named ...h but calls ufma8vlv — confirm this is intended
+#define TRFOVERh4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3) /* TRFOVERh on four pixels; only the s operands are swapped, the m operands are used as given */ \
+	(d0) = ufma8vlv((d0), (0xffffffff^(m0)), 0);		\
+	(d1) = ufma8vlv((d1), (0xffffffff^(m1)), 0);		\
+	(d2) = ufma8vlv((d2), (0xffffffff^(m2)), 0);		\
+	(d3) = ufma8vlv((d3), (0xffffffff^(m3)), 0);		\
+	(d0) = (ufma8vlv(pixelswap(s0), (m0), (d0)));	\
+	(d1) = (ufma8vlv(pixelswap(s1), (m1), (d1)));	\
+	(d2) = (ufma8vlv(pixelswap(s2), (m2), (d2)));	\
+	(d3) = (ufma8vlv(pixelswap(s3), (m3), (d3)))
+
+#define TROUTREVl(d,m,s) (d) = ufma8vlv((d), (0xffffffff^(m)), 0) // only the first (d with ~m) step of TROVERl; the s argument is accepted but never used
+#define TROUTREVl4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3) /* TROUTREVl on four pixels; s0..s3 are ignored */ \
+	(d0) = ufma8vlv((d0), (0xffffffff^(m0)), 0);		\
+	(d1) = ufma8vlv((d1), (0xffffffff^(m1)), 0);		\
+	(d2) = ufma8vlv((d2), (0xffffffff^(m2)), 0);		\
+	(d3) = ufma8vlv((d3), (0xffffffff^(m3)), 0)
+
+#define BLITSM8D32_FWD_FWD(NAME, TOP, TOP4) /* emits bitblit_solid_msk8_dst32_fwd_fwd_NAME; TOP4 handles 4 pixels, TOP handles 1 */ \
+	static void bitblit_solid_msk8_dst32_fwd_fwd_##NAME(const unsigned_param_type xm, \
+														 const unsigned_param_type ym, \
+														 const unsigned_param_type wi, \
+														 const unsigned_param_type re, \
+														 const unsigned_param_type xd, \
+														 const unsigned_param_type yd, \
+														 const unsigned int fgcolor, \
+														 unsigned char* msk_ptr, \
+														 unsigned char* dst_ptr, \
+														 const unsigned_param_type msk_stride, \
+														 const unsigned_param_type dst_stride) { \
+		unsigned int i, j;												\
+		unsigned char *mptr = (msk_ptr + (ym * msk_stride) + xm);		/* xm, xd and both strides are added as byte offsets */ \
+		unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd);		\
+		unsigned char *mptr_line = mptr;								\
+		unsigned char *dptr_line = dptr;								\
+		for (j = 0 ; j < re ; j++) {									/* one pass per row, re rows */ \
+			unsigned char *mptr_elt = mptr_line;						/* mask is read one byte per pixel */ \
+			unsigned int *dptr_elt = (unsigned int*)dptr_line;			/* destination is accessed one unsigned int per pixel */ \
+			i = 0;														\
+			if (wi > 3) for ( ; i < (wi-3) ; i+= 4) {					/* 4 pixels per pass; the wi > 3 guard keeps wi-3 from underflowing */ \
+				unsigned char m0 = *(mptr_elt+0);						\
+				unsigned char m1 = *(mptr_elt+1);						\
+				unsigned char m2 = *(mptr_elt+2);						\
+				unsigned char m3 = *(mptr_elt+3);						\
+				unsigned int d0 = *(dptr_elt+0);						\
+				unsigned int d1 = *(dptr_elt+1);						\
+				unsigned int d2 = *(dptr_elt+2);						\
+				unsigned int d3 = *(dptr_elt+3);						\
+				TOP4(d0,d1,d2,d3,m0,m1,m2,m3,fgcolor,fgcolor,fgcolor,fgcolor); /* the same fgcolor is the source for every pixel */ \
+				*(dptr_elt+0) = d0;										\
+				*(dptr_elt+1) = d1;										\
+				*(dptr_elt+2) = d2;										\
+				*(dptr_elt+3) = d3;										\
+				dptr_elt += 4;											\
+				mptr_elt += 4;											\
+			}															\
+			for ( ; i < wi ; i++) {										/* leftover pixels, one at a time */ \
+				TOP(*dptr_elt, *mptr_elt, fgcolor);						\
+				dptr_elt ++;											\
+				mptr_elt ++;											\
+			}															\
+			mptr_line += msk_stride;									/* each surface steps by its own stride */ \
+			dptr_line += dst_stride;									\
+		}																\
+	}
+	
+BLITSM8D32_FWD_FWD(over, TROVERl, TROVERl4)
+//BLITSM8D32_FWD_FWD(outreverse, TROUTREVl, TROUTREVl4)
+
+	
+#define BLITS32M32D32_FWD_FWD(NAME, TOP, TOP4) /* emits bitblit_src32_msk32_dst32_fwd_fwd_NAME; TOP4 handles 4 pixels, TOP handles 1 */ \
+	static void bitblit_src32_msk32_dst32_fwd_fwd_##NAME(const unsigned_param_type xs, /* xs/xm/xd: byte offsets within a row */ \
+														  const unsigned_param_type ys, /* ys/ym/yd: row indices, multiplied by the matching stride */ \
+														  const unsigned_param_type xm, \
+														  const unsigned_param_type ym, \
+														  const unsigned_param_type wi, /* pixels (unsigned int elements) per row */ \
+														  const unsigned_param_type re, /* number of rows */ \
+														  const unsigned_param_type xd, \
+														  const unsigned_param_type yd, \
+														  unsigned char* src_ptr, \
+														  unsigned char* msk_ptr, \
+														  unsigned char* dst_ptr, \
+														  const unsigned_param_type src_stride, /* strides: bytes between consecutive rows */ \
+														  const unsigned_param_type msk_stride, \
+														  const unsigned_param_type dst_stride) { \
+		unsigned int i, j;												\
+		unsigned char *sptr = (src_ptr + (ys * src_stride) + xs);		/* NOTE(review): the visible caller passes xs/xm unscaled but xd scaled — confirm */ \
+		unsigned char *mptr = (msk_ptr + (ym * msk_stride) + xm);		\
+		unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd);		\
+		unsigned char *sptr_line = sptr;								\
+		unsigned char *mptr_line = mptr;								\
+		unsigned char *dptr_line = dptr;								\
+		for (j = 0 ; j < re ; j++) {									/* one pass per row */ \
+			unsigned int *sptr_elt = (unsigned int*)sptr_line;			\
+			unsigned int *mptr_elt = (unsigned int*)mptr_line;			\
+			unsigned int *dptr_elt = (unsigned int*)dptr_line;			\
+			i = 0;														\
+			if (wi > 3) for ( ; i < (wi-3) ; i+= 4) {					/* 4 pixels per pass; the wi > 3 guard keeps wi-3 from underflowing */ \
+				unsigned int s0 = *(sptr_elt+0);						\
+				unsigned int s1 = *(sptr_elt+1);						\
+				unsigned int s2 = *(sptr_elt+2);						\
+				unsigned int s3 = *(sptr_elt+3);						\
+				unsigned int m0 = *(mptr_elt+0);						\
+				unsigned int m1 = *(mptr_elt+1);						\
+				unsigned int m2 = *(mptr_elt+2);						\
+				unsigned int m3 = *(mptr_elt+3);						\
+				unsigned int d0 = *(dptr_elt+0);						\
+				unsigned int d1 = *(dptr_elt+1);						\
+				unsigned int d2 = *(dptr_elt+2);						\
+				unsigned int d3 = *(dptr_elt+3);						\
+				TOP4(d0,d1,d2,d3,m0,m1,m2,m3,s0,s1,s2,s3);				\
+				*(dptr_elt+0) = d0;										\
+				*(dptr_elt+1) = d1;										\
+				*(dptr_elt+2) = d2;										\
+				*(dptr_elt+3) = d3;										\
+				sptr_elt += 4;											\
+				dptr_elt += 4;											\
+				mptr_elt += 4;											\
+			}															\
+			for ( ; i < wi ; i++) {										/* leftover pixels, one at a time */ \
+				TOP(*dptr_elt, *mptr_elt, *sptr_elt);					\
+				sptr_elt ++;											\
+				dptr_elt ++;											\
+				mptr_elt ++;											\
+			}															\
+			sptr_line += src_stride;									/* source rows advance by the source stride, not the destination's */ \
+			mptr_line += msk_stride;									\
+			dptr_line += dst_stride;									\
+		}																\
+	}
+	
+
+BLITS32M32D32_FWD_FWD(over, TROVERh, TROVERh4)
+	
+
+	
+#define BLITS32D32_FWD_FWD(NAME, TOP, TOP4) /* emits bitblit_src32_dst32_fwd_fwd_NAME; TOP4 handles 4 pixels, TOP handles 1 */ \
+	static void bitblit_src32_dst32_fwd_fwd_##NAME(const unsigned_param_type xs, /* xs/xd: byte offsets within a row */ \
+												   const unsigned_param_type ys, /* ys/yd: row indices, multiplied by the matching stride */ \
+												   const unsigned_param_type wi, /* pixels (unsigned int elements) per row */ \
+												   const unsigned_param_type re, /* number of rows */ \
+												   const unsigned_param_type xd, \
+												   const unsigned_param_type yd, \
+												   unsigned char* src_ptr, \
+												   unsigned char* dst_ptr, \
+												   const unsigned_param_type src_stride, /* strides: bytes between consecutive rows */ \
+												   const unsigned_param_type dst_stride) { \
+		unsigned int i, j;												\
+		unsigned char *sptr = (src_ptr + (ys * src_stride) + xs);		/* NOTE(review): the visible caller passes xs unscaled but xd scaled — confirm */ \
+		unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd);		\
+		unsigned char *sptr_line = sptr;								\
+		unsigned char *dptr_line = dptr;								\
+		for (j = 0 ; j < re ; j++) {									/* one pass per row */ \
+			unsigned int *sptr_elt = (unsigned int*)sptr_line;			\
+			unsigned int *dptr_elt = (unsigned int*)dptr_line;			\
+			i = 0;														\
+			if (wi > 3) for ( ; i < (wi-3) ; i+= 4) {					/* 4 pixels per pass; the wi > 3 guard keeps wi-3 from underflowing */ \
+				unsigned int s0 = *(sptr_elt+0);						\
+				unsigned int s1 = *(sptr_elt+1);						\
+				unsigned int s2 = *(sptr_elt+2);						\
+				unsigned int s3 = *(sptr_elt+3);						\
+				unsigned int d0 = *(dptr_elt+0);						\
+				unsigned int d1 = *(dptr_elt+1);						\
+				unsigned int d2 = *(dptr_elt+2);						\
+				unsigned int d3 = *(dptr_elt+3);						\
+				TOP4(d0,d1,d2,d3,s0,s1,s2,s3,s0,s1,s2,s3);				/* the source pixel is passed as both mask and source operand */ \
+				*(dptr_elt+0) = d0;										\
+				*(dptr_elt+1) = d1;										\
+				*(dptr_elt+2) = d2;										\
+				*(dptr_elt+3) = d3;										\
+				sptr_elt += 4;											\
+				dptr_elt += 4;											\
+			}															\
+			for ( ; i < wi ; i++) {										/* leftover pixels, one at a time */ \
+				TOP(*dptr_elt, *sptr_elt, *sptr_elt);					\
+				sptr_elt ++;											\
+				dptr_elt ++;											\
+			}															\
+			sptr_line += src_stride;									/* source rows advance by the source stride, not the destination's */ \
+			dptr_line += dst_stride;									\
+		}																\
+	}
+	
+BLITS32D32_FWD_FWD(over, TROVERh, TROVERh4)
+BLITS32D32_FWD_FWD(fover, TRFOVERh, TRFOVERh4)
+
--- a/nubus-to-ztex-gateware/goblin_accel.py
+++ b/nubus-to-ztex-gateware/goblin_accel.py
@ -20,11 +20,11 @@ class GoblinAccel(Module): # AutoCSR ?
        reg_status = Signal(32) # 0
        reg_cmd = Signal(32) # 1
        reg_r5_cmd = Signal(32) # 2, to communicate with Vex
-        # 3 resv0
+        reg_op = Signal(8) # 3, X11 op or (0x80 | Render op)
        reg_width = Signal(COORD_BITS) # 4
        reg_height = Signal(COORD_BITS) # 5
        reg_fgcolor = Signal(32) # 6
-        # 7 resv2
+        reg_depth = Signal(8) # 7, depth (0 is native)
        reg_bitblt_src_x = Signal(COORD_BITS) # 8
        reg_bitblt_src_y = Signal(COORD_BITS) # 9
        reg_bitblt_dst_x = Signal(COORD_BITS) # 10
@ -34,17 +34,26 @@ class GoblinAccel(Module): # AutoCSR ?
        reg_src_ptr = Signal(32) # 14
        reg_dst_ptr = Signal(32) # 15
        
+        reg_bitblt_msk_x = Signal(COORD_BITS) # 16
+        reg_bitblt_msk_y = Signal(COORD_BITS) # 17
+        reg_msk_stride = Signal(COORD_BITS) # 18
+        reg_msk_ptr = Signal(32) # 19
+        
        # do-some-work flags
        do_blit = Signal()
        do_fill = Signal()
        do_patt = Signal()
-        do_test = Signal()
+        do_rsmsk8dst32 = Signal()
+        do_rsrc32msk32dst32 = Signal()
+        do_rsrc32dst32 = Signal()

        # cmd register reg_cmd
        DO_BLIT_BIT = 0
        DO_FILL_BIT = 1
        DO_PATT_BIT = 2
-        DO_TEST_BIT = 3
+        DO_RSMSK8DST32_BIT = 3
+        DO_RSRC32MSK32DST32_BIT = 4
+        DO_RSRC32DST32_BIT = 5
        
        # global status register reg_status
        WORK_IN_PROGRESS_BIT = 0
@ -83,14 +92,16 @@ class GoblinAccel(Module): # AutoCSR ?
                                      NextValue(do_blit, bus_dat_w_endian[DO_BLIT_BIT] & ~reg_status[WORK_IN_PROGRESS_BIT]),
                                      NextValue(do_fill, bus_dat_w_endian[DO_FILL_BIT] & ~reg_status[WORK_IN_PROGRESS_BIT]),
                                      NextValue(do_patt, bus_dat_w_endian[DO_PATT_BIT] & ~reg_status[WORK_IN_PROGRESS_BIT]),
-                                      NextValue(do_test, bus_dat_w_endian[DO_TEST_BIT] & ~reg_status[WORK_IN_PROGRESS_BIT]),
+                                      NextValue(do_rsmsk8dst32,      bus_dat_w_endian[DO_RSMSK8DST32_BIT]      & ~reg_status[WORK_IN_PROGRESS_BIT]),
+                                      NextValue(do_rsrc32msk32dst32, bus_dat_w_endian[DO_RSRC32MSK32DST32_BIT] & ~reg_status[WORK_IN_PROGRESS_BIT]),
+                                      NextValue(do_rsrc32dst32,      bus_dat_w_endian[DO_RSRC32DST32_BIT]      & ~reg_status[WORK_IN_PROGRESS_BIT]),
                                ],
                                2:  [ NextValue(reg_r5_cmd, bus_dat_w_endian) ],
-                                # 3
+                                3:  [ NextValue(reg_op, bus_dat_w_endian) ],
                                4:  [ NextValue(reg_width, bus_dat_w_endian) ],
                                5:  [ NextValue(reg_height, bus_dat_w_endian) ],
                                6:  [ NextValue(reg_fgcolor, bus_dat_w_endian) ],
-                                # 7
+                                7:  [ NextValue(reg_depth, bus_dat_w_endian) ],
                                8:  [ NextValue(reg_bitblt_src_x, bus_dat_w_endian) ],
                                9:  [ NextValue(reg_bitblt_src_y, bus_dat_w_endian) ],
                                10: [ NextValue(reg_bitblt_dst_x, bus_dat_w_endian) ],
@ -99,6 +110,11 @@ class GoblinAccel(Module): # AutoCSR ?
                                13: [ NextValue(reg_dst_stride, bus_dat_w_endian) ],
                                14: [ NextValue(reg_src_ptr, bus_dat_w_endian) ],
                                15: [ NextValue(reg_dst_ptr, bus_dat_w_endian) ],
+                                
+                                16: [ NextValue(reg_bitblt_msk_x, bus_dat_w_endian) ],
+                                17: [ NextValue(reg_bitblt_msk_y, bus_dat_w_endian) ],
+                                18: [ NextValue(reg_msk_stride, bus_dat_w_endian) ],
+                                19: [ NextValue(reg_msk_ptr, bus_dat_w_endian) ],
                            }),
                            NextValue(bus.ack, 1),
                            ).Elif(bus.cyc & bus.stb & ~bus.we & ~bus.ack, #read
@ -107,11 +123,11 @@ class GoblinAccel(Module): # AutoCSR ?
                                       0:  [ NextValue(bus_dat_r_endian, reg_status) ],
                                       1:  [ NextValue(bus_dat_r_endian, reg_cmd) ],
                                       2:  [ NextValue(bus_dat_r_endian, reg_r5_cmd) ],
-                                       # 3
+                                       3:  [ NextValue(bus_dat_r_endian, reg_op) ],
                                       4:  [ NextValue(bus_dat_r_endian, reg_width) ],
                                       5:  [ NextValue(bus_dat_r_endian, reg_height) ],
                                       6:  [ NextValue(bus_dat_r_endian, reg_fgcolor) ],
-                                       # 7
+                                       7:  [ NextValue(bus_dat_r_endian, reg_depth) ],
                                       8:  [ NextValue(bus_dat_r_endian, reg_bitblt_src_x) ],
                                       9:  [ NextValue(bus_dat_r_endian, reg_bitblt_src_y) ],
                                       10: [ NextValue(bus_dat_r_endian, reg_bitblt_dst_x) ],
@ -120,6 +136,11 @@ class GoblinAccel(Module): # AutoCSR ?
                                       13: [ NextValue(bus_dat_r_endian, reg_dst_stride) ],
                                       14: [ NextValue(bus_dat_r_endian, reg_src_ptr) ],
                                       15: [ NextValue(bus_dat_r_endian, reg_dst_ptr) ],
+                                       
+                                       16: [ NextValue(bus_dat_r_endian, reg_bitblt_msk_x) ],
+                                       17: [ NextValue(bus_dat_r_endian, reg_bitblt_msk_y) ],
+                                       18: [ NextValue(bus_dat_r_endian, reg_msk_stride) ],
+                                       19: [ NextValue(bus_dat_r_endian, reg_msk_ptr) ],
                                   }),
                                   NextValue(bus.ack, 1),
                            ).Else(
@ -132,7 +153,9 @@ class GoblinAccel(Module): # AutoCSR ?
        FUN_BLIT_BIT = 0
        FUN_FILL_BIT = 1
        FUN_PATT_BIT = 2
-        FUN_TEST_BIT = 3
+        FUN_RSMSK8DST32_BIT = 3
+        FUN_RSRC32MSK32DST32_BIT = 4
+        FUN_RSRC32DST32_BIT = 5
        # to hold the Vex in reset
        self.local_reset = local_reset = Signal(reset = 1)

@ -160,9 +183,21 @@ class GoblinAccel(Module): # AutoCSR ?
                   reg_status[WORK_IN_PROGRESS_BIT].eq(1),
                   local_reset.eq(0),
                   #timeout.eq(timeout_rst),
-            ).Elif(do_test & ~reg_status[WORK_IN_PROGRESS_BIT],
-                   do_test.eq(0),
-                   reg_r5_cmd[FUN_TEST_BIT].eq(1),
+            ).Elif(do_rsmsk8dst32 & ~reg_status[WORK_IN_PROGRESS_BIT],
+                   do_rsmsk8dst32.eq(0),
+                   reg_r5_cmd[FUN_RSMSK8DST32_BIT].eq(1),
+                   reg_status[WORK_IN_PROGRESS_BIT].eq(1),
+                   local_reset.eq(0),
+                   #timeout.eq(timeout_rst),
+            ).Elif(do_rsrc32msk32dst32 & ~reg_status[WORK_IN_PROGRESS_BIT],
+                   do_rsrc32msk32dst32.eq(0),
+                   reg_r5_cmd[FUN_RSRC32MSK32DST32_BIT].eq(1),
+                   reg_status[WORK_IN_PROGRESS_BIT].eq(1),
+                   local_reset.eq(0),
+                   #timeout.eq(timeout_rst),
+            ).Elif(do_rsrc32dst32 & ~reg_status[WORK_IN_PROGRESS_BIT],
+                   do_rsrc32dst32.eq(0),
+                   reg_r5_cmd[FUN_RSRC32DST32_BIT].eq(1),
                   reg_status[WORK_IN_PROGRESS_BIT].eq(1),
                   local_reset.eq(0),
                   #timeout.eq(timeout_rst),
--- a/nubus-to-ztex-gateware/ldsdsupport.h
+++ b/nubus-to-ztex-gateware/ldsdsupport.h
@ -72,8 +72,8 @@ asm(".set regnum_t6  , 31");
 				 : "r" (base)											\
 				 );														\
 	
-#define ld(base, imm12, o1, o2) opcode_ld(0x03, 0x03, base, imm12, o1, o2)
-#define ldu(base, imm12, o1, o2) opcode_ld(0x03, 0x07, base, imm12, o1, o2)
+#define _custom_ld(base, imm12, o1, o2) opcode_ld(0x03, 0x03, base, imm12, o1, o2) /* forwards to opcode_ld with constants 0x03, 0x03 (presumably opcode and func3, as in opcode_sd -- confirm) */
+#define _custom_ldu(base, imm12, o1, o2) opcode_ld(0x03, 0x07, base, imm12, o1, o2) /* same as _custom_ld except the second constant is 0x07 */

 #define opcode_sd(opcode, func3, base, imm04, imm511, i1, i2)			\
 	asm volatile(".word ((" #opcode ") | (" #imm04 " << 7) | (regnum_%0 << 15) | (regnum_%1 << 20) | (" #imm511 " << 25) | ((" #func3 ") << 12));" \
@ -81,4 +81,43 @@ asm(".set regnum_t6  , 31");
 				 : "r" (base), "r" (i1), "r" (i2)						\
 				 );														\
 
-#define sd(base, imm04, imm511, i1, i2) opcode_sd(0x23, 0x03, base, imm04, imm511, i1, i2)
+#define _custom_sd(base, imm04, imm511, i1, i2) opcode_sd(0x23, 0x03, base, imm04, imm511, i1, i2) /* opcode_sd with opcode 0x23 and func3 0x03; imm04 and imm511 are pasted as text into bits 7 and 25 of the emitted word */
+
+/* opcode_p: emit one hand-encoded 32-bit instruction word using R-type field
+ * positions: opcode | rd << 7 | func3 << 12 | rs1 << 15 | rs2 << 20 | func7 << 25.
+ * opcode, func3 and func7 are stringified with '#', so they are pasted into the
+ * assembler expression as text rather than evaluated by the C compiler.
+ * rd, rs1 and rs2 are C operands: the compiler chooses their registers and the
+ * assembler maps each register name to its number through the regnum_<name>
+ * symbols defined earlier in this header.
+ * rd is a write-only output. The expansion already ends with ';'.
+ */
+#define opcode_p(opcode, func3, func7, rd, rs1, rs2)					\
+	asm volatile(".word ((" #opcode ") | (regnum_%0 << 7) | (regnum_%1 << 15) | (regnum_%2 << 20) | ((" #func3 ") << 12) | ((" #func7 ") << 25));" \
+				 : "=r" (rd)	/* %0: destination, written only */		\
+				 : "r" (rs1), "r" (rs2)	/* %1, %2: source registers */	\
+				 );
+#define opcode_p_ter(opcode, func3, func7, rd, rs1, rs2)	/* three-operand form: same .word encoding as opcode_p, but rd is also read */ \
+	asm volatile(".word ((" #opcode ") | (regnum_%0 << 7) | (regnum_%1 << 15) | (regnum_%2 << 20) | ((" #func3 ") << 12) | ((" #func7 ") << 25));" \
+				 : "+r" (rd)	/* %0: read-write, must hold a value before the instruction runs */ \
+				 : "r" (rs1), "r" (rs2)	/* %1, %2: source registers */	\
+				 );	/* the expansion already ends with ';' */
+/* Instruction wrappers: all four use opcode 0x77 and func3 0x00; only func7
+ * differs (0x1c, 0x1d, 0x64, 0x66). The _ufma8v* pair goes through
+ * opcode_p_ter, so rd is read as well as written and must be initialised
+ * by the caller.
+ * NOTE(review): ukadd8/uksub8 share their names with the RISC-V P-extension
+ * unsigned saturating 8-bit add/subtract -- confirm the core implements those
+ * semantics. The arithmetic of the _ufma8v* operations is not visible in this
+ * header.
+ */
+#define _ukadd8(rd, rs1, rs2)      opcode_p(0x00000077, 0x00, 0x1c, rd, rs1, rs2)
+#define _uksub8(rd, rs1, rs2)      opcode_p(0x00000077, 0x00, 0x1d, rd, rs1, rs2)
+#define _ufma8vhv(rd, rs1, rs2)      opcode_p_ter(0x00000077, 0x00, 0x64, rd, rs1, rs2)
+#define _ufma8vlv(rd, rs1, rs2)      opcode_p_ter(0x00000077, 0x00, 0x66, rd, rs1, rs2)
+
+/* Function wrapper around the _ukadd8 instruction macro: passes a and b as the
+ * two source operands and returns whatever the instruction leaves in its
+ * destination register. */
+static inline unsigned int ukadd8(const unsigned int a, const unsigned int b)
+{
+	unsigned int result;	/* written by the instruction, never read before */
+
+	_ukadd8(result, a, b);
+	return result;
+}
+/* Function wrapper around the _uksub8 instruction macro: passes a and b as the
+ * two source operands (in that order) and returns the destination register. */
+static inline unsigned int uksub8(const unsigned int a, const unsigned int b)
+{
+	unsigned int result;	/* written by the instruction, never read before */
+
+	_uksub8(result, a, b);
+	return result;
+}
+
+/* Function wrapper around the _ufma8vhv instruction macro. The destination
+ * register is a read-write operand, so it is seeded with c before the
+ * instruction runs; a and b are the two source operands. Returns the
+ * destination register after the instruction. */
+static inline unsigned int ufma8vhv(const unsigned int a, const unsigned int b, const unsigned int c)
+{
+	unsigned int acc = c;	/* initial value of the read-write destination */
+
+	_ufma8vhv(acc, a, b);
+	return acc;
+}
+/* Function wrapper around the _ufma8vlv instruction macro. The destination
+ * register is a read-write operand, so it is seeded with c before the
+ * instruction runs; a and b are the two source operands. Returns the
+ * destination register after the instruction. */
+static inline unsigned int ufma8vlv(const unsigned int a, const unsigned int b, const unsigned int c)
+{
+	unsigned int acc = c;	/* initial value of the read-write destination */
+
+	_ufma8vlv(acc, a, b);
+	return acc;
+}