From dd87235fc04ea67486bea302955c69612bde8543 Mon Sep 17 00:00:00 2001 From: Glenn Anderson Date: Fri, 18 Mar 2022 15:51:22 -0700 Subject: [PATCH] Pin down alignment for writeDataLoop and readDataLoop, and perform some additional timing adjustment. --- src/BlueSCSI.cpp | 65 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/src/BlueSCSI.cpp b/src/BlueSCSI.cpp index 6d3d64e..3b5a5a7 100644 --- a/src/BlueSCSI.cpp +++ b/src/BlueSCSI.cpp @@ -743,27 +743,35 @@ void writeDataPhase(int len, const byte* p) } #if READ_SPEED_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize ("-Os") /* * This loop is tuned to repeat the following pattern: * 1) Set REQ - * 2) 5-6 cycles of work/delay + * 2) 5 cycles of work/delay * 3) Wait for ACK * Cycle time tunings are for 72MHz STM32F103 + * Alignment matters. For the 3 instruction wait loops,it looks like crossing + * an 8 byte prefetch buffer can add 2 cycles of wait every branch taken. */ +void writeDataLoop(uint32_t blocksize) __attribute__ ((aligned(8))); void writeDataLoop(uint32_t blocksize) { -#define REQ_ON() (*db_dst = BITMASK(vREQ)<<16); +#define REQ_ON() (port_b->BRR = req_bit); #define FETCH_BSRR_DB() (bsrr_val = bsrr_tbl[*srcptr++]) -#define REQ_OFF_DB_SET(BSRR_VAL) *db_dst = BSRR_VAL; -#define WAIT_ACK_ACTIVE() while(!SCSI_IN(vACK)) -#define WAIT_ACK_INACTIVE() while(SCSI_IN(vACK)) +#define REQ_OFF_DB_SET(BSRR_VAL) port_b->BSRR = BSRR_VAL; +#define WAIT_ACK_ACTIVE() while((*port_a_idr>>(vACK&15)&1)) +#define WAIT_ACK_INACTIVE() while(!(*port_a_idr>>(vACK&15)&1)) register byte *srcptr= m_buf; // Source buffer register byte *endptr= m_buf + blocksize; // End pointer register const uint32_t *bsrr_tbl = db_bsrr; // Table to convert to BSRR register uint32_t bsrr_val; // BSRR value to output (DB, DBP, REQ = ACTIVE) - register volatile uint32_t *db_dst = &(GPIOB->regs->BSRR); // Output port + + register uint32_t req_bit = BITMASK(vREQ); + register gpio_reg_map *port_b = PBREG; + register volatile uint32_t *port_a_idr = &(GPIOA->regs->IDR); // Start the first bus cycle. FETCH_BSRR_DB(); @@ -772,19 +780,19 @@ void writeDataLoop(uint32_t blocksize) FETCH_BSRR_DB(); WAIT_ACK_ACTIVE(); REQ_OFF_DB_SET(bsrr_val); + // Align the starts of the do/while and WAIT loops to an 8 byte prefetch. + asm("nop.w;nop"); do{ WAIT_ACK_INACTIVE(); REQ_ON(); - // 6 cycle delay before reading ACK. - // Store plus 2 loads is 6 cycles. - REQ_ON(); + // 4 cycles of work FETCH_BSRR_DB(); + // Extra 1 cycle delay while keeping the loop within an 8 byte prefetch. + asm("nop"); WAIT_ACK_ACTIVE(); REQ_OFF_DB_SET(bsrr_val); - // 5 cycle delay before reading ACK. - // Branch taken is 2-4, seems to be taking 3. A second write is 2 more cycles. - // cmp is being pipelined in to a store so doesn't add any time. - REQ_OFF_DB_SET(bsrr_val); + // Extra 1 cycle delay, plus 4 cycles for the branch taken with prefetch. + asm("nop"); }while(srcptr < endptr); WAIT_ACK_INACTIVE(); // Finish the last bus cycle, byte is already on DB. @@ -793,6 +801,7 @@ void writeDataLoop(uint32_t blocksize) REQ_OFF_DB_SET(bsrr_val); WAIT_ACK_INACTIVE(); } +#pragma GCC pop_options #endif /* @@ -841,32 +850,48 @@ void readDataPhase(int len, byte* p) p[i] = readHandshake(); } +#if WRITE_SPEED_OPTIMIZE +#pragma GCC push_options +#pragma GCC optimize ("-Os") + +/* + * See writeDataLoop for optimization info. + */ +void readDataLoop(uint32_t blockSize) __attribute__ ((aligned(8))); void readDataLoop(uint32_t blockSize) { register byte *dstptr= m_buf; register byte *endptr= m_buf + blockSize - 1; -#define REQ_ON() (port_b->BSRR = BITMASK(vREQ)<<16); -#define REQ_OFF() (port_b->BSRR = BITMASK(vREQ)); -#define WAIT_ACK_ACTIVE() while((*ack_src>>(vACK&15)&1)) -#define WAIT_ACK_INACTIVE() while(!(*ack_src>>(vACK&15)&1)) +#define REQ_ON() (port_b->BRR = req_bit); +#define REQ_OFF() (port_b->BSRR = req_bit); +#define WAIT_ACK_ACTIVE() while((*port_a_idr>>(vACK&15)&1)) +#define WAIT_ACK_INACTIVE() while(!(*port_a_idr>>(vACK&15)&1)) + register uint32_t req_bit = BITMASK(vREQ); register gpio_reg_map *port_b = PBREG; - register volatile uint32_t *ack_src = &(GPIOA->regs->IDR); + register volatile uint32_t *port_a_idr = &(GPIOA->regs->IDR); REQ_ON(); + // Start of the do/while and WAIT are already aligned to 8 bytes. do { WAIT_ACK_ACTIVE(); - uint32_t ret = GPIOB->regs->IDR; + uint32_t ret = port_b->IDR; REQ_OFF(); *dstptr++ = ~(ret >> 8); + // Move wait loop in to a single 8 byte prefetch buffer + asm("nop.w"); WAIT_ACK_INACTIVE(); REQ_ON(); + // Extra 1 cycle delay + asm("nop"); } while(dstptrregs->IDR; REQ_OFF(); - *dstptr++ = ~(ret >> 8); + *dstptr = ~(ret >> 8); WAIT_ACK_INACTIVE(); } +#pragma GCC pop_options +#endif /* * Data out phase.