LD/LDU/SD (64-bit, dual-register) support in Vex + accel; ramdisk tested on Q650

This commit is contained in:
Romain Dolbeau 2022-06-24 23:37:18 +02:00
parent 2d2cbdbafe
commit 173c87ea02
14 changed files with 1711 additions and 999 deletions

View File

@ -11,3 +11,4 @@ blit.raw
blit.s
*.patch
OLD
nubusfpga_csr_*.h

View File

@ -0,0 +1,7 @@
vid_decl_rom.bin
vid_decl_rom.dir
vid_decl_rom.l
vid_decl_rom.o
vid_decl_rom.raw
vid_decl_rom.srec
*.bin

View File

@ -93,6 +93,7 @@ vid_decl_rom.dir: vid_decl_rom.raw append_romdir
vid_decl_rom.bin: vid_decl_rom.dir
${NUBUS_CHECKSUM} --input_file $< --output_file $@ --output_size 32768
dd if=dump.cpr of=vid_decl_rom.bin bs=1 conv=notrunc
clean:
rm -f res.inc ${CSRC_ASM} *.o vid_decl_rom.srec vid_decl_rom.raw vid_decl_rom.dir vid_decl_rom.l

View File

@ -17,7 +17,9 @@
#warning "Using default VRES"
#endif
#define GOBOFB_BASE 0x00900000
#define GOBOFB_BASE 0x00900000
#define GOBOFB_ACCEL 0x00901000
#define GOBOFB_ACCEL_LE 0x00901800
//#define GOBOFB_REG_BASE 0x00900000
//#define GOBOFB_MEM_BASE 0x00000000 /* remapped to 0x8f800000 by HW */
@ -44,6 +46,34 @@
#define GOBOFB_MODE_24BIT 0x10
#define GOBOFB_MODE_15BIT 0x11
#define u_int32_t volatile unsigned long
/*
 * Register block of the Goblin accelerator.
 *
 * u_int32_t is #defined just above as `volatile unsigned long`, so every
 * field access is a volatile access (assumes unsigned long is 32 bits on the
 * target, i.e. one register word per field).
 * The trailing numbers are the word index of the first field of each group.
 * Bit positions for reg_status / reg_cmd are the *_BIT defines that follow
 * this struct.
 */
struct goblin_accel_regs {
u_int32_t reg_status; // 0
u_int32_t reg_cmd;
u_int32_t reg_r5_cmd;
u_int32_t resv0;
u_int32_t reg_width; // 4
u_int32_t reg_height;
u_int32_t reg_fgcolor;
u_int32_t resv2;
u_int32_t reg_bitblt_src_x; // 8
u_int32_t reg_bitblt_src_y;
u_int32_t reg_bitblt_dst_x;
u_int32_t reg_bitblt_dst_y;
u_int32_t reg_src_stride; // 12
u_int32_t reg_dst_stride;
u_int32_t reg_src_ptr; // 14
u_int32_t reg_dst_ptr;
};
// status
#define WORK_IN_PROGRESS_BIT 0
// cmd
#define DO_BLIT_BIT 0 // hardwired in goblin_accel.py
#define DO_FILL_BIT 1 // hardwired in goblin_accel.py
#define DO_TEST_BIT 3 // hardwired in goblin_accel.py
struct MyGammaTbl {
short gVersion; /*gamma version number*/
short gType; /*gamma data type*/

View File

@ -276,10 +276,12 @@ OSErr cNuBusFPGACtl(CntrlParamPtr pb, /* DCtlPtr */ AuxDCEPtr dce)
UInt32 a32_4p0, a32_4p1;
const uint32_t wb = HRES >> idx;
unsigned short j, i;
if (vPInfo->csPage != 0)
return paramErr;
SwapMMUMode ( &busMode );
#if 0
if ((dStore->curMode != kDepthMode5) && (dStore->curMode != kDepthMode6)) {
/* grey the screen */
a32_l0 = a32;
@ -313,6 +315,25 @@ OSErr cNuBusFPGACtl(CntrlParamPtr pb, /* DCtlPtr */ AuxDCEPtr dce)
a32_l1 += 2*HRES*4;
}
}
#else
#define WAIT_FOR_HW_LE(accel_le) \
while (accel_le->reg_status & (1<<WORK_IN_PROGRESS_BIT))
const UInt32 fgcolor = 0; // FIXME: per-depth?
struct goblin_accel_regs* accel_le = (struct goblin_accel_regs*)(dce->dCtlDevBase+GOBOFB_ACCEL_LE);
WAIT_FOR_HW_LE(accel_le);
accel_le->reg_width = HRES; // pixels
accel_le->reg_height = VRES;
accel_le->reg_bitblt_dst_x = 0; // pixels
accel_le->reg_bitblt_dst_y = 0;
accel_le->reg_dst_ptr = 0;
accel_le->reg_fgcolor = fgcolor;
accel_le->reg_cmd = (1<<DO_FILL_BIT);
WAIT_FOR_HW_LE(accel_le);
#undef WAIT_FOR_HW_LE
#endif
SwapMMUMode ( &busMode );
ret = noErr;

View File

@ -24,6 +24,7 @@ UInt32 Primary(SEBlock* seblock) {
/* PRIM_WRITEREG(GOBOFB_DEBUG, busMode);// trace */
/* grey the screen */
/* should switch to HW ? */
a32_l0 = a32;
a32_l1 = a32 + HRES;
for (j = 0 ; j < VRES ; j+= 2) {

View File

@ -12,11 +12,6 @@
struct RAMDrvContext {
DrvSts2 drvsts;
//Ptr origcopyfunc;
//Ptr origdisk; /* keep unstripped pointers for Dispose*/
//unsigned char * disk;
//char initialized;
//char alreadyalloced;
};
#define DRIVE_SIZE_BYTES ((256ul-8ul)*1024ul*1024ul) // FIXME: mem size minus fb size

View File

@ -6,22 +6,27 @@
OSErr cNuBusFPGARAMDskOpen(IOParamPtr pb, /* DCtlPtr */ AuxDCEPtr dce)
{
DrvSts2 *dsptr; // pointer to the DrvSts2 in our context
DrvQElPtr dq;
int drvnum = 1;
struct RAMDrvContext *ctx;
OSErr ret = noErr;
char busMode;
busMode = 1;
SwapMMUMode ( &busMode ); // to32 // this likely won't work on older MacII ???
dce->dCtlDevBase = 0xfc000000;
dce->dCtlDevBase = 0xfc000000; // FIXME: why do we not get our slot properly ?
write_reg(dce, GOBOFB_DEBUG, 0xDEAD0000);
/* write_reg(dce, GOBOFB_DEBUG, dce->dCtlRefNum); */
if (dce->dCtlStorage == nil) {
DrvQElPtr dq;
for(dq = (DrvQElPtr)(GetDrvQHdr())->qHead; dq; dq = (DrvQElPtr)dq->qLink) {
if (dq->dQDrive >= drvnum)
drvnum = dq->dQDrive+1;
}
ReserveMemSys(sizeof(struct RAMDrvContext));
dce->dCtlStorage = NewHandleSysClear(sizeof(struct RAMDrvContext));
if (dce->dCtlStorage == nil) {
ret = openErr;
@ -65,7 +70,7 @@ OSErr cNuBusFPGARAMDskOpen(IOParamPtr pb, /* DCtlPtr */ AuxDCEPtr dce)
write_reg(dce, GOBOFB_DEBUG, compressed[2]);
write_reg(dce, GOBOFB_DEBUG, compressed[3]);
*/
res = rledec(superslot, compressed, 730);
res = rledec(superslot, compressed, 730); // FIXME: 730 = 2920/4 (compressed size in words)
/*
write_reg(dce, GOBOFB_DEBUG, res);
write_reg(dce, GOBOFB_DEBUG, 0xDEEEEEAD);
@ -76,6 +81,7 @@ OSErr cNuBusFPGARAMDskOpen(IOParamPtr pb, /* DCtlPtr */ AuxDCEPtr dce)
MyAddDrive(dsptr->dQRefNum, drvnum, (DrvQElPtr)&dsptr->qLink);
}
SwapMMUMode ( &busMode );
done:
return ret;

View File

@ -4,77 +4,82 @@
#ifndef SKIP_MAIN
/*
 * Run-length encode `len` 32-bit words from `in` into `out`.
 *
 * The output is a sequence of records, each starting with a signed 32-bit
 * count `c` stored through __builtin_bswap32() (rledec() swaps it back on
 * non-m68k hosts and reads it natively on m68k):
 *   c >= 0 : one data word follows, to be repeated (c + 1) times;
 *   c <  0 : (1 + -c) literal data words follow.
 * Data words are copied verbatim (not byte-swapped).
 *
 * out : destination buffer; the caller must size it for the worst case
 *       (the encoded stream can be larger than the input).
 * in  : source words.
 * len : number of words in `in`.
 *
 * Returns the number of words written to `out` (0 when len is 0).
 */
uint32_t rleenc(uint32_t* out, const uint32_t* in, const uint32_t len) {
  uint32_t i, j = 0, p, ib = 0, k;
  int32_t c = 0; /* >0: inside a repeat run, <0: inside a literal run, 0: undecided */

  if (len == 0)
    return 0;

  p = in[0];

  for (i = 1 ; i < len ; i++) {
    if (c == 0) { // just started
      if (in[i] == p) { // repeat
        c++;
      } else { // non-repeat
        p = in[i];
        c--;
        ib = i - 1; // first word of the literal run
      }
    } else if (c > 0) { // in-repeat
      if (in[i] == p) { // keep repeating
        c++;
      } else { // exit repeat
        out[j++] = __builtin_bswap32((uint32_t)c); // write result
        out[j++] = p;
        p = in[i]; // restart
        c = 0;
      }
    } else { // c < 0
      if (in[i] == p) { // exit non-repeat
        // the previous word starts the new repeat run, so it is removed
        // from the literal run: count becomes c+1, and -c words are emitted
        out[j++] = __builtin_bswap32((uint32_t)(c + 1));
        for (k = 0 ; k < (uint32_t)(-c) ; k++)
          out[j++] = in[ib + k];
        p = in[i]; // restart
        c = 1; // this and previous
      } else { // non-repeat
        p = in[i];
        c--;
      }
    }
  }

  /* flush the run still open at end of input */
  out[j++] = __builtin_bswap32((uint32_t)c);
  if (c < 0) {
    /* a literal record must carry all (1 + -c) words, not just the last one,
       otherwise the decoder reads past the end of the stream */
    for (k = 0 ; k < (uint32_t)(1 - c) ; k++)
      out[j++] = in[ib + k];
  } else {
    out[j++] = p;
  }

  return j;
}
#endif
/*
 * Decode a stream produced by rleenc().
 *
 * Each record starts with a signed 32-bit count `c` (byte-swapped back with
 * __builtin_bswap32() unless built for m68k, where it is read as stored):
 *   c >= 0 : the next word is repeated (c + 1) times;
 *   c <  0 : the next (1 + -c) words are copied verbatim.
 *
 * Repeat runs of 300000 or more words are NOT fully written: only the first
 * four and last four words of the run are stored and the output index skips
 * over the rest, on the assumption that such runs are padding whose content
 * does not matter.
 *
 * out : destination buffer, sized by the caller for the decoded data.
 * in  : encoded words.
 * len : number of words in `in`.
 *
 * Decoding stops at the first record that does not fit entirely within `len`
 * words, so a truncated stream never causes reads past `in + len`.
 *
 * Returns the number of output words produced (skipped padding included).
 */
uint32_t rledec(uint32_t* out, const uint32_t* in, const uint32_t len) {
  uint32_t i = 0, j = 0, k;

  while (i < len) {
#ifndef __m68k__
    const int32_t c = (int32_t)__builtin_bswap32(in[i]);
#else
    const int32_t c = (int32_t)(in[i]);
#endif
    const uint32_t avail = len - i - 1; /* words left after the count */

    if (c >= 0) {
      const uint32_t run = (uint32_t)c + 1u;
      uint32_t v;

      if (avail < 1)
        break; /* truncated: count without its data word */
      v = in[i + 1];

      if (c < 300000) { // !!!!!!!!!!!!!!!!!!!!!!!!!!
        for (k = 0 ; k < run ; k++)
          out[j++] = v;
      } else { // do a small subset at the beginning and end instead of the full range and assume this is padding otherwise
        for (k = 0 ; k < 4 ; k++)
          out[j + k] = v;
        for (k = run - 4 ; k < run ; k++)
          out[j + k] = v;
        j += run;
      }
      i += 2;
    } else {
      /* 1 + -c, computed in unsigned arithmetic so INT32_MIN cannot overflow */
      const uint32_t lit = 1u - (uint32_t)c;

      if (lit > avail)
        break; /* truncated: literal run extends past the stream */
      for (k = 0 ; k < lit ; k++)
        out[j++] = in[i + 1 + k];
      i += 1 + lit;
    }
  }

  return j;
}
#ifndef SKIP_MAIN
@ -87,58 +92,58 @@ uint32_t rledec(uint32_t* out, const uint32_t* in, const uint32_t len) {
#include <unistd.h>
/*
 * Host-side helper (built only when SKIP_MAIN is not defined).
 *
 *  1. reads up to 248 MiB from "dump.raw";
 *  2. compresses it with rleenc() and writes the result to "dump.cpr";
 *  3. writes the same compressed words as a C initializer to "dump_cpr.c";
 *  4. decompresses with rledec() and writes the result to "dump.ucp".
 *
 * Returns 0 on success, 1 if an allocation or any file operation fails.
 *
 * NOTE(review): the compressed stream can be larger than the input for
 * poorly compressible data; the 256 MiB output buffer leaves 8 MiB of
 * margin over the 248 MiB input cap and is not otherwise checked.
 */
int main(int argc, char** argv) {
  const size_t buf_bytes = 256*1024*1024;
  const size_t max_in_bytes = 248*1024*1024ull;
  uint32_t *bufa = NULL, *bufb = NULL;
  uint32_t len, k;
  size_t total = 0;
  ssize_t n;
  FILE* f;
  int fd;
  int ret = 1;

  (void)argc;
  (void)argv;

  bufa = calloc(buf_bytes / sizeof(uint32_t), sizeof(uint32_t));
  bufb = calloc(buf_bytes / sizeof(uint32_t), sizeof(uint32_t));
  if (bufa == NULL || bufb == NULL) {
    perror("calloc");
    goto out;
  }

  fd = open("dump.raw", O_RDONLY);
  if (fd < 0) {
    perror("dump.raw");
    goto out;
  }
  /* read() may return short counts: loop until EOF or the size cap */
  while (total < max_in_bytes) {
    n = read(fd, (char*)bufa + total, max_in_bytes - total);
    if (n < 0) {
      perror("dump.raw");
      close(fd);
      goto out;
    }
    if (n == 0)
      break;
    total += (size_t)n;
  }
  close(fd);
  len = (uint32_t)(total / 4); /* whole 32-bit words only */

  printf("File : %u bytes\n", (unsigned int)(len*4));

  len = rleenc(bufb, bufa, len);

  printf("Compressed : %u bytes\n", (unsigned int)(len*4));

  /* O_TRUNC: do not leave stale bytes behind when the new file is shorter */
  fd = open("dump.cpr", O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU);
  if (fd < 0) {
    perror("dump.cpr");
    goto out;
  }
  if (write(fd, bufb, (size_t)len*4) != (ssize_t)((size_t)len*4)) {
    perror("dump.cpr");
    close(fd);
    goto out;
  }
  close(fd);

  f = fopen("dump_cpr.c", "w");
  if (f == NULL) {
    perror("dump_cpr.c");
    goto out;
  }
  /* NOTE(review): this declares an array of pointers ("unsigned long*")
     initialised with integers; "unsigned long compressed[...]" was probably
     intended - left unchanged pending confirmation of how the file is used */
  fprintf(f, "unsigned long* compressed[%d] = {\n", (int)len);
  for (k = 0 ; k < len ; k++) {
    fprintf(f, "0x%08x%s", bufb[k],
            k == (len-1) ? "};" : (k%8 == 7 ? ",\n" : ",")
            );
  }
  if (fclose(f) != 0) {
    perror("dump_cpr.c");
    goto out;
  }

  /* bufa still holds the original input, so regions rledec() skips as
     padding keep their original content in the round-trip output */
  len = rledec(bufa, bufb, len);

  printf("Uncompressed : %u bytes\n", (unsigned int)(len*4));

  fd = open("dump.ucp", O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU);
  if (fd < 0) {
    perror("dump.ucp");
    goto out;
  }
  if (write(fd, bufa, (size_t)len*4) != (ssize_t)((size_t)len*4)) {
    perror("dump.ucp");
    close(fd);
    goto out;
  }
  close(fd);

  ret = 0;
out:
  free(bufb);
  free(bufa);
  return ret;
}
#endif

View File

@ -19,12 +19,12 @@ object GenGoblinAccel { // extends App {
val config = VexRiscvConfig(
plugins = List(
new IBusCachedPlugin(
resetVector = 0x70910000, // beginning of ROM
resetVector = 0xF0910000l, // beginning of ROM
relaxedPcCalculation = false,
prediction = STATIC,
config = InstructionCacheConfig(
cacheSize = 512,
bytePerLine = 32,
cacheSize = 256,
bytePerLine = 16,
wayCount = 1,
addressWidth = 32,
cpuDataWidth = 32,
@ -42,8 +42,8 @@ object GenGoblinAccel { // extends App {
// ),
new DBusCachedPlugin(
config = new DataCacheConfig(
cacheSize = 512,
bytePerLine = 32,
cacheSize = 256,
bytePerLine = 16,
wayCount = 2,
addressWidth = 32,
cpuDataWidth = 128,
@ -51,7 +51,7 @@ object GenGoblinAccel { // extends App {
catchAccessError = false,
catchIllegal = false,
catchUnaligned = false,
pendingMax = 8, // 64
pendingMax = 8, // 64 ; irrelevant? only for SMP?
withWriteAggregation = true // required if memDataWidth > 32
),
dBusCmdMasterPipe = false, // prohibited if memDataWidth > 32
@ -68,8 +68,8 @@ object GenGoblinAccel { // extends App {
new DecoderSimplePlugin(
catchIllegalInstruction = false
),
new RegFilePlugin(
regFileReadyKind = plugin.SYNC,
new RegFileOddEvenPlugin(
regFileReadyKind = plugin.ASYNC, // FIXME why is even-odd failing with SYNC??? (and what's the difference...)
zeroBoot = false
),
new IntAluPlugin,
@ -83,7 +83,7 @@ object GenGoblinAccel { // extends App {
//new BitManipZbaPlugin(earlyInjection = false), // sh.add
//new BitManipZbbPlugin(earlyInjection = false), // zero-ext, min/max, others
//new BitManipZbtPlugin(earlyInjection = false), // cmov, cmix, funnel
new CG6Plugin(earlyInjection = false),
new CG6Plugin(earlyInjection = false), // full-custom list
new HazardSimplePlugin(
bypassExecute = true,
bypassMemory = true,

File diff suppressed because it is too large Load Diff

View File

@ -81,6 +81,8 @@ struct goblin_accel_regs {
//#include "./rvintrin.h"
#include "ldsdsupport.h"
void from_reset(void) __attribute__ ((noreturn)); // nothrow,
static inline void flush_cache(void) {
@ -376,6 +378,17 @@ static void rectfill(const unsigned_param_type xd,
}
if (wi > 3) {
unsigned int u32color = (unsigned int)u8color | ((unsigned int)u8color)<<8 | ((unsigned int)u8color)<<16 | ((unsigned int)u8color)<<24;
if ((wi>15) && (((unsigned int)dptr_elt&0x7)==0)) {
register unsigned int s8 asm("s8");
register unsigned int s9 asm("s9");
s8 = u32color;
s9 = u32color;
for ( ; i < (wi-15) ; i+=16) {
sd(dptr_elt, 0, 0, s8, s9);
sd(dptr_elt, 8, 0, s8, s9);
dptr_elt += 16;
}
}
for ( ; i < (wi-3) ; i+=4) {
*(unsigned int*)dptr_elt = u32color;
dptr_elt +=4;
@ -732,10 +745,139 @@ static void invert(const unsigned_param_type xd,
BLIT_FWD_FWD(NAME, OP) \
BLIT_FWD_BWD(NAME, OP) \
BLIT_BWD_FWD(NAME, OP) \
#define BLIT_NOTALLDIR(NAME, OP) \
BLIT_FWD_BWD(NAME, OP) \
BLIT_BWD_FWD(NAME, OP) \
BLIT_ALLDIR(copy, COPY)
//BLIT_ALLDIR(copy, COPY)
BLIT_NOTALLDIR(copy, COPY)
BLIT_ALLDIR(xor, XOR)
BLIT_ALLDIR(copy_pm, COPY_PM)
BLIT_ALLDIR(xor_pm, XOR_PM)
/*
 * Hand-written forward/forward copy blit (replaces the macro-generated
 * version, which is no longer instantiated for "copy").
 *
 * Copies a rectangle of `wi` bytes by `re` rows from
 * src_ptr + ys*src_stride + xs to dst_ptr + yd*dst_stride + xd, walking rows
 * top to bottom and bytes left to right.
 *
 * xs, ys      : source origin (byte column, row)
 * wi, re      : width in bytes and number of rows
 * xd, yd      : destination origin (byte column, row)
 * pm          : plane mask; only used to build u32pm, which is never applied
 *               here, so all bits are copied
 * src_ptr/dst_ptr       : base addresses of source and destination surfaces
 * src_stride/dst_stride : bytes between consecutive rows
 *
 * Rows wider than 3 bytes take one of two fast paths:
 *  - xs and xd differ modulo 4: dest is byte-aligned to 4, then whole words
 *    are built from two aligned source words with the `fsr` instruction;
 *  - otherwise: dest (and source) are aligned to 4, dest is then aligned to
 *    8, and the bulk is moved 16 bytes at a time with the ld/sd register-pair
 *    macros from ldsdsupport.h.
 * A byte loop finishes whatever is left (and handles rows of 3 bytes or less).
 */
static void bitblit_fwd_fwd_copy(const unsigned_param_type xs,
const unsigned_param_type ys,
const unsigned_param_type wi,
const unsigned_param_type re,
const unsigned_param_type xd,
const unsigned_param_type yd,
const unsigned char pm,
unsigned char* src_ptr,
unsigned char* dst_ptr,
const unsigned_param_type src_stride,
const unsigned_param_type dst_stride) {
unsigned int j;
unsigned char *sptr = (src_ptr + (ys * src_stride) + xs);
unsigned char *dptr = (dst_ptr + (yd * dst_stride) + xd);
unsigned char *sptr_line = sptr;
unsigned char *dptr_line = dptr;
/*const unsigned char npm = ~pm;*/
/* one iteration per row */
for (j = 0 ; j < re ; j++) {
register unsigned char *sptr_elt = sptr_line;
unsigned char *dptr_elt = dptr_line;
const unsigned char *dptr_elt_last = dptr_line + wi; /* one past the last dest byte of this row */
if (wi>3) {
/* NOTE(review): the test uses the x offsets only; it matches the pointers'
   relative alignment only if the bases and strides are co-aligned - confirm */
if ((xs & 0x3) != (xd & 0x3)) {
/* align dest, we'll deal with src via shift realignment using fsr */
for ( ; (dptr_elt < dptr_elt_last) && ((unsigned int)dptr_elt&0x3)!=0; ) {
dptr_elt[0] = sptr_elt[0];
dptr_elt ++;
sptr_elt ++;
}
/* source rounded down to a word boundary, and its misalignment in bits */
unsigned char *sptr_elt_al = (unsigned char*)((unsigned int)sptr_elt & ~0x3);
unsigned int fsr_cst = 8*((unsigned int)sptr_elt & 0x3);
unsigned int src0 = ((unsigned int*)sptr_elt_al)[0];
unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; /* unused */
/* handle unaligned src */
for ( ; (dptr_elt < (dptr_elt_last-3)) ; ) {
/* combine the current and next aligned source words into one dest word */
unsigned int src1 = ((unsigned int*)sptr_elt_al)[1];
unsigned int val;
asm("fsr %0, %1, %2, %3\n" : "=r"(val) : "r"(src0), "r"(src1), "r"(fsr_cst));
((unsigned int*)dptr_elt)[0] = val;
src0 = src1;
dptr_elt += 4;
sptr_elt_al += 4;
}
/* restore the byte-accurate source pointer for the tail loop */
sptr_elt = sptr_elt_al + ((unsigned int)sptr_elt & 0x3);
} else {
const unsigned int u32pm = (unsigned int)pm | ((unsigned int)pm)<<8 | ((unsigned int)pm)<<16 | ((unsigned int)pm)<<24; /* unused */
const unsigned char* dptr_elt_end = dptr_elt + wi; /* unused */
/* align dest & src (they are aligned the same here) */
for ( ; (dptr_elt < dptr_elt_last) && ((unsigned int)dptr_elt&0x3)!=0; ) {
dptr_elt[0] = sptr_elt[0];
dptr_elt ++;
sptr_elt ++;
}
/* align to 8 for ld/sd */
/* NOTE(review): only dest is tested for 8-byte alignment; src is merely
   4-byte co-aligned with it - confirm ld accepts such a source address */
for ( ; (dptr_elt < (dptr_elt_last-3)) && ((unsigned int)dptr_elt&0x7)!=0;) {
((unsigned int*)dptr_elt)[0] = ((unsigned int*)sptr_elt)[0];
dptr_elt += 4;
sptr_elt += 4;
}
#if 0
/* disabled: 32 bytes per iteration using four register pairs */
for ( ; (dptr_elt < (dptr_elt_last-31)) ; ) {
register unsigned int s4 asm("s4");
register unsigned int s5 asm("s5");
register unsigned int s6 asm("s6");
register unsigned int s7 asm("s7");
register unsigned int s8 asm("s8");
register unsigned int s9 asm("s9");
register unsigned int s10 asm("s10");
register unsigned int s11 asm("s11");
ld(sptr_elt, 0, s4, s5);
ld(sptr_elt, 16, s8, s9);
ld(sptr_elt, 8, s6, s7);
sd(dptr_elt, 0, 0, s4, s5);
sd(dptr_elt, 8, 0, s6, s7);
ld(sptr_elt, 24, s10, s11);
sd(dptr_elt, 16, 0, s8, s9);
sptr_elt += 32;
sd(dptr_elt, 24, 0, s10, s11);
dptr_elt += 32;
}
#endif
/* main loop: 16 bytes per iteration through two register pairs; the
   variables are pinned to s8/s9 and s10/s11 because the ld/sd macros encode
   the register names directly into the instruction word */
for ( ; (dptr_elt < (dptr_elt_last-15)) ; ) {
register unsigned int s8 asm("s8");
register unsigned int s9 asm("s9");
register unsigned int s10 asm("s10");
register unsigned int s11 asm("s11");
ld(sptr_elt, 0, s8, s9);
ld(sptr_elt, 8, s10, s11);
sd(dptr_elt, 0, 0, s8, s9);
sptr_elt += 16;
sd(dptr_elt, 8, 0, s10, s11);
dptr_elt += 16;
}
#if 0
/* disabled: 8 bytes per iteration */
for ( ; (dptr_elt < (dptr_elt_last-7)) ; ) {
register unsigned int s8 asm("s8");
register unsigned int s9 asm("s9");
ld(sptr_elt, 0, s8, s9);
sd(dptr_elt, 0, 0, s8, s9);
sptr_elt += 8;
dptr_elt += 8;
}
#endif
/* remaining whole words */
for ( ; (dptr_elt < (dptr_elt_last-3)) ; ) {
((unsigned int*)dptr_elt)[0] = ((unsigned int*)sptr_elt)[0];
dptr_elt += 4;
sptr_elt += 4;
}
}
}
/* common tail loop */
for ( ; dptr_elt < dptr_elt_last ; ) {
dptr_elt[0] = sptr_elt[0];
dptr_elt ++;
sptr_elt ++;
}
sptr_line += src_stride;
dptr_line += dst_stride;
}
}

View File

@ -15,7 +15,7 @@ GCCPFX=riscv32-buildroot-linux-gnu-
GCC=${GCCDIR}/bin/${GCCPFX}gcc
OBJCOPY=${GCCDIR}/bin/${GCCPFX}objcopy
OPT=-Os #-fno-inline
OPT=-O3 #-fno-inline
ARCH=rv32im_zba_zbb_zbt
PARAM="-DBASE_FB=${BASE_FB}"

View File

@ -0,0 +1,84 @@
#pragma once
/*
 * Assembler symbols mapping every RISC-V integer register name to its
 * register number. The .word-based macros in this header paste the register
 * name the compiler substitutes for an asm operand after "regnum_", so the
 * assembler can compute the instruction encoding from the resulting symbol.
 */
/* architectural names x0..x31 */
asm(".set regnum_x0 , 0");
asm(".set regnum_x1 , 1");
asm(".set regnum_x2 , 2");
asm(".set regnum_x3 , 3");
asm(".set regnum_x4 , 4");
asm(".set regnum_x5 , 5");
asm(".set regnum_x6 , 6");
asm(".set regnum_x7 , 7");
asm(".set regnum_x8 , 8");
asm(".set regnum_x9 , 9");
asm(".set regnum_x10 , 10");
asm(".set regnum_x11 , 11");
asm(".set regnum_x12 , 12");
asm(".set regnum_x13 , 13");
asm(".set regnum_x14 , 14");
asm(".set regnum_x15 , 15");
asm(".set regnum_x16 , 16");
asm(".set regnum_x17 , 17");
asm(".set regnum_x18 , 18");
asm(".set regnum_x19 , 19");
asm(".set regnum_x20 , 20");
asm(".set regnum_x21 , 21");
asm(".set regnum_x22 , 22");
asm(".set regnum_x23 , 23");
asm(".set regnum_x24 , 24");
asm(".set regnum_x25 , 25");
asm(".set regnum_x26 , 26");
asm(".set regnum_x27 , 27");
asm(".set regnum_x28 , 28");
asm(".set regnum_x29 , 29");
asm(".set regnum_x30 , 30");
asm(".set regnum_x31 , 31");
/* ABI names (zero, ra, sp, gp, tp, t0-t6, s0-s11, a0-a7) */
asm(".set regnum_zero, 0");
asm(".set regnum_ra , 1");
asm(".set regnum_sp , 2");
asm(".set regnum_gp , 3");
asm(".set regnum_tp , 4");
asm(".set regnum_t0 , 5");
asm(".set regnum_t1 , 6");
asm(".set regnum_t2 , 7");
asm(".set regnum_s0 , 8");
asm(".set regnum_s1 , 9");
asm(".set regnum_a0 , 10");
asm(".set regnum_a1 , 11");
asm(".set regnum_a2 , 12");
asm(".set regnum_a3 , 13");
asm(".set regnum_a4 , 14");
asm(".set regnum_a5 , 15");
asm(".set regnum_a6 , 16");
asm(".set regnum_a7 , 17");
asm(".set regnum_s2 , 18");
asm(".set regnum_s3 , 19");
asm(".set regnum_s4 , 20");
asm(".set regnum_s5 , 21");
asm(".set regnum_s6 , 22");
asm(".set regnum_s7 , 23");
asm(".set regnum_s8 , 24");
asm(".set regnum_s9 , 25");
asm(".set regnum_s10 , 26");
asm(".set regnum_s11 , 27");
asm(".set regnum_t3 , 28");
asm(".set regnum_t4 , 29");
asm(".set regnum_t5 , 30");
asm(".set regnum_t6 , 31");
/*
 * Emit a custom dual-register load as a raw instruction word.
 *
 * opcode, func3 : values for bits 0..6 and 12..14 of the instruction
 * base          : C expression holding the address (goes in bits 15..19)
 * imm12         : constant byte offset, assembled into bits 20..31
 * o1, o2        : lvalues receiving the loaded data. Only o1's register is
 *                 encoded (bits 7..11); o2 is listed as an output so the
 *                 compiler knows it is overwritten. Callers pin o1/o2 to
 *                 specific registers with `register ... asm("sN")`
 *                 (presumably an even/odd pair, o2 = o1 + 1 - confirm
 *                 against the CPU implementation).
 *
 * The "memory" clobber is required: the instruction reads memory the
 * compiler cannot see through the operands, so pending stores must be
 * completed before it executes.
 * The expansion ends with its own ';' and must not be followed by a line
 * continuation, so that the next #define is not swallowed into this macro.
 */
#define opcode_ld(opcode, func3, base, imm12, o1, o2) \
asm volatile(".word ((" #opcode ") | (regnum_%0 << 7) | (regnum_%2 << 15) | (((" #imm12 ") & 0xfff) << 20) | ((" #func3 ") << 12));" \
: "=&r" (o1), "=&r" (o2) \
: "r" (base) \
: "memory" \
);
/* ld / ldu: the two load variants, differing only in func3 (0x03 vs 0x07) */
#define ld(base, imm12, o1, o2) opcode_ld(0x03, 0x03, base, imm12, o1, o2)
#define ldu(base, imm12, o1, o2) opcode_ld(0x03, 0x07, base, imm12, o1, o2)
/*
 * Emit a custom dual-register store as a raw instruction word.
 *
 * opcode, func3 : values for bits 0..6 and 12..14 of the instruction
 * base          : C expression holding the address (goes in bits 15..19)
 * imm04         : low 5 bits of the constant byte offset (bits 7..11)
 * imm511        : upper 7 bits of the constant byte offset (bits 25..31)
 * i1, i2        : values to store. Only i1's register is encoded
 *                 (bits 20..24); i2 is listed as an input so the compiler
 *                 materialises it. Callers pin i1/i2 to specific registers
 *                 with `register ... asm("sN")` (presumably an even/odd
 *                 pair, i2 = i1 + 1 - confirm against the CPU
 *                 implementation).
 *
 * The "memory" clobber is required: the instruction writes memory the
 * compiler cannot see through the operands, so surrounding loads and stores
 * must not be cached or reordered across it.
 * The expansion ends with its own ';' and must not be followed by a line
 * continuation, so that the next #define is not swallowed into this macro.
 */
#define opcode_sd(opcode, func3, base, imm04, imm511, i1, i2) \
asm volatile(".word ((" #opcode ") | (((" #imm04 ") & 0x1f) << 7) | (regnum_%0 << 15) | (regnum_%1 << 20) | (((" #imm511 ") & 0x7f) << 25) | ((" #func3 ") << 12));" \
: \
: "r" (base), "r" (i1), "r" (i2) \
: "memory" \
);
#define sd(base, imm04, imm511, i1, i2) opcode_sd(0x23, 0x03, base, imm04, imm511, i1, i2)