diff --git a/SheepShaver/src/Unix/sysdeps.h b/SheepShaver/src/Unix/sysdeps.h index 194455a7..b8e575ea 100644 --- a/SheepShaver/src/Unix/sysdeps.h +++ b/SheepShaver/src/Unix/sysdeps.h @@ -88,6 +88,10 @@ #define PPC_PROFILE_COMPILE_TIME 0 #define PPC_PROFILE_GENERIC_CALLS 0 #define KPX_MAX_CPUS 1 +// direct block chaining is only tested on PPC right now +#if defined(__powerpc__) +#define DYNGEN_DIRECT_BLOCK_CHAINING 1 +#endif #else // Mac ROM is write protected #define ROM_IS_WRITE_PROTECTED 1 diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen-ops.cpp b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen-ops.cpp index ac4e035b..cbea5a58 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen-ops.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen-ops.cpp @@ -273,13 +273,6 @@ DEFINE_OP(8,T0,1,T1); dyngen_barrier(); \ } while (0) -#if defined(__powerpc__) -#define FAST_DISPATCH(TARGET) asm volatile ("b " #TARGET) -#endif -#if defined(__i386__) || defined(__x86_64__) -#define FAST_DISPATCH(TARGET) asm volatile ("jmp " #TARGET) -#endif - extern "C" void OPPROTO op_execute(uint8 *entry_point, basic_cpu *this_cpu); void OPPROTO op_execute(uint8 *entry_point, basic_cpu *this_cpu) { @@ -340,13 +333,18 @@ void OPPROTO op_jmp_slow(void) void OPPROTO op_jmp_fast(void) { -#ifdef FAST_DISPATCH - FAST_DISPATCH(__op_param1); +#ifdef DYNGEN_FAST_DISPATCH + DYNGEN_FAST_DISPATCH(__op_param1); #else SLOW_DISPATCH(PARAM1); #endif } +void OPPROTO op_jmp_A0(void) +{ + SLOW_DISPATCH(reg_A0); +} + // Register calling conventions based arches don't need a stack frame #if defined(__powerpc__) || defined(__x86_64__) #define DEFINE_OP(NAME, CODE) \ @@ -477,4 +475,16 @@ DEFINE_OP(op_invoke_direct_CPU_im_im, { CALL(func(CPU, PARAM2, PARAM3)); }); +DEFINE_OP(op_invoke_CPU_T0_ret_A0, { + typedef void *(*func_t)(void *, uintptr); + func_t func = (func_t)reg_A0; + reg_A0 = (uintptr)CALL(func(CPU, reg_T0)); +}); + +DEFINE_OP(op_invoke_direct_CPU_T0_ret_A0, { + typedef void 
*(*func_t)(void *, uintptr); + func_t func = (func_t)PARAM1; + reg_A0 = (uintptr)CALL(func(CPU, reg_T0)); +}); + #undef DEFINE_OP diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp index a4b9920a..20e12b05 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp @@ -135,3 +135,15 @@ basic_dyngen::gen_invoke_CPU_im_im(void (*func)(dyngen_cpu_base, uint32, uint32) gen_op_invoke_CPU_im_im(param1, param2); } } + +void +basic_dyngen::gen_invoke_CPU_T0_ret_A0(void *(*func)(dyngen_cpu_base)) +{ + if (direct_call_possible((uintptr)func)) + gen_op_invoke_direct_CPU_T0_ret_A0((uintptr)func); + else { + gen_op_mov_ad_A0_im((uintptr)func); + gen_op_invoke_CPU_T0_ret_A0(); + } +} + diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp index b015055d..49d931c2 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp @@ -84,6 +84,7 @@ public: void gen_invoke_CPU_T0(void (*func)(dyngen_cpu_base, uint32)); void gen_invoke_CPU_im(void (*func)(dyngen_cpu_base, uint32), uint32 value); void gen_invoke_CPU_im_im(void (*func)(dyngen_cpu_base, uint32, uint32), uint32 param1, uint32 param2); + void gen_invoke_CPU_T0_ret_A0(void *(*func)(dyngen_cpu_base)); // Raw aliases #define DEFINE_ALIAS_RAW(NAME, ARGLIST, ARGS) \ @@ -176,6 +177,7 @@ public: // Jump instructions DEFINE_ALIAS(jmp_slow,1); DEFINE_ALIAS(jmp_fast,1); + DEFINE_ALIAS(jmp_A0,0); // Load/Store instructions DEFINE_ALIAS(load_u32_T0_A0_T1,0); @@ -203,8 +205,37 @@ public: #undef DEFINE_ALIAS_2 #undef DEFINE_ALIAS_3 #undef DEFINE_ALIAS_RAW + +#if DYNGEN_DIRECT_BLOCK_CHAINING + // Jump addresses for direct chaining + uint8 *jmp_addr[2]; + + // Set jump target address + void set_jmp_target(uint8 *jmp_addr, uint8 *addr); +#endif }; +#if DYNGEN_DIRECT_BLOCK_CHAINING 
+inline void +basic_dyngen::set_jmp_target(uint8 *jmp_addr, uint8 *addr) +{ +#if defined(__powerpc__) + // patch the branch destination + uint32 *ptr = (uint32 *)jmp_addr; + uint32 val = *ptr; + val = (val & ~0x03fffffc) | ((addr - jmp_addr) & 0x03fffffc); + *ptr = val; + + // flush icache + asm volatile ("dcbst 0,%0" : : "r"(ptr) : "memory"); + asm volatile ("sync" : : : "memory"); + asm volatile ("icbi 0,%0" : : "r"(ptr) : "memory"); + asm volatile ("sync" : : : "memory"); + asm volatile ("isync" : : : "memory"); +#endif +} +#endif + inline bool basic_dyngen::direct_jump_possible(uintptr target) const { diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/dyngen-exec.h b/SheepShaver/src/kpx_cpu/src/cpu/jit/dyngen-exec.h index 9f43fad5..4412accb 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/dyngen-exec.h +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/dyngen-exec.h @@ -96,6 +96,14 @@ extern int __op_cpuparam; #define CPUPARAM ((long)(&__op_cpuparam)) #endif +// Direct block chaining support +#if defined(__powerpc__) +#define DYNGEN_FAST_DISPATCH(TARGET) asm volatile ("b " #TARGET) +#endif +#if defined(__i386__) || defined(__x86_64__) +#define DYNGEN_FAST_DISPATCH(TARGET) asm volatile ("jmp " #TARGET) +#endif + extern int __op_jmp0, __op_jmp1; #endif /* DYNGEN_EXEC_H */ diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/dyngen.c b/SheepShaver/src/kpx_cpu/src/cpu/jit/dyngen.c index 17374b52..f72a7670 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/dyngen.c +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/dyngen.c @@ -867,7 +867,7 @@ void gen_code(const char *name, const char *demangled_name, runtime to do translated block chaining: the offset of the instruction needs to be stored */ - fprintf(outfile, " jmp_offsets[%d] = %d + (code_ptr() - gen_code_buf);\n", + fprintf(outfile, " jmp_addr[%d] = code_ptr() + %d;\n", n, rel->r_offset - start_offset); continue; } diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/jit-config.hpp b/SheepShaver/src/kpx_cpu/src/cpu/jit/jit-config.hpp index 
6a2a49f2..104a7a8f 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/jit-config.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/jit-config.hpp @@ -44,6 +44,17 @@ #define DYNGEN_ASM_OPTS 0 #endif +/** + * DYNGEN_DIRECT_BLOCK_CHAINING + * + * Define to enable direct block chaining on platforms supporting + * that feature. e.g. PowerPC. + **/ + +#ifndef DYNGEN_DIRECT_BLOCK_CHAINING +#define DYNGEN_DIRECT_BLOCK_CHAINING 0 +#endif + /** * Helpers to reach JIT backends headers **/ diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-blockinfo.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-blockinfo.hpp index d6170784..c49774d7 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-blockinfo.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-blockinfo.hpp @@ -21,6 +21,7 @@ #ifndef PPC_BLOCKINFO_H #define PPC_BLOCKINFO_H +#include "cpu/jit/jit-config.hpp" #include "nvmemfun.hpp" #include "basic-blockinfo.hpp" @@ -42,6 +43,9 @@ struct powerpc_block_info #endif #if PPC_ENABLE_JIT uint8 * entry_point; +#if DYNGEN_DIRECT_BLOCK_CHAINING + uint8 * jmp_addr[2]; // Jump addresses for direct chaining +#endif #endif uintptr min_pc, max_pc; diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp index 0d29759a..03dee336 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp @@ -431,6 +431,26 @@ bool powerpc_cpu::check_spcflags() return true; } +#if DYNGEN_DIRECT_BLOCK_CHAINING +void *powerpc_cpu::compile_chain_block(block_info *sbi) +{ + // Block index is stuffed into the source basic block pointer, + // which is aligned at least on 4-byte boundaries + const int n = ((uintptr)sbi) & 3; + sbi = (block_info *)(((uintptr)sbi) & ~3L); + const uint32 bpc = sbi->pc; + + const uint32 tpc = pc(); + block_info *tbi = block_cache.find(tpc); + if (tbi == NULL) + tbi = compile_block(tpc); + assert(tbi && tbi->pc == tpc); + + codegen.set_jmp_target(sbi->jmp_addr[n], tbi->entry_point); + 
return tbi->entry_point; +} +#endif + void powerpc_cpu::execute(uint32 entry) { pc() = entry; @@ -644,6 +664,13 @@ void powerpc_cpu::invalidate_cache_range(uintptr start, uintptr end) { D(bug("Invalidate cache block [%08x - %08x]\n", start, end)); #if PPC_DECODE_CACHE || PPC_ENABLE_JIT +#if DYNGEN_DIRECT_BLOCK_CHAINING + if (use_jit) { + // Invalidate on page boundaries + start &= -4096; + end = (end + 4095) & -4096; + } +#endif block_cache.clear_range(start, end); #endif } diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp index 654e89e3..1d1d1ff0 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp @@ -337,6 +337,9 @@ private: friend class powerpc_dyngen; powerpc_dyngen codegen; block_info *compile_block(uint32 entry); +#if DYNGEN_DIRECT_BLOCK_CHAINING + void *compile_chain_block(block_info *sbi); +#endif #endif // Semantic action templates diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp index 6cf3bcd6..89d1d28f 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp @@ -684,7 +684,7 @@ DEFINE_OP(branch_if_not_T0); #undef DEFINE_OP #undef DEFINE_OP_CTR -template< int bo > +template< int bo, bool chain > static inline void do_execute_branch_bo(uint32 tpc, uint32 npc) { bool ctr_ok = true; @@ -706,20 +706,33 @@ static inline void do_execute_branch_bo(uint32 tpc, uint32 npc) powerpc_dyngen_helper::set_ctr(ctr); } - if (ctr_ok && cond_ok) + if (ctr_ok && cond_ok) { powerpc_dyngen_helper::set_pc(tpc); - else +#ifdef DYNGEN_FAST_DISPATCH + if (chain && powerpc_dyngen_helper::spcflags().empty()) + DYNGEN_FAST_DISPATCH(__op_jmp0); +#endif + } + else { powerpc_dyngen_helper::set_pc(npc); +#ifdef DYNGEN_FAST_DISPATCH + if (chain && powerpc_dyngen_helper::spcflags().empty()) + 
DYNGEN_FAST_DISPATCH(__op_jmp1); +#endif + } dyngen_barrier(); } #define BO(A,B,C,D) (((A) << 4)| ((B) << 3) | ((C) << 2) | ((D) << 1)) -#define DEFINE_OP(BO_SUFFIX, BO_VALUE) \ -void OPPROTO op_branch_A0_bo_##BO_SUFFIX(void) \ -{ \ - do_execute_branch_bo<BO BO_VALUE>(A0, PARAM1); \ +#define DEFINE_OP1(BO_SUFFIX, BO_VALUE, CHAIN) \ +void OPPROTO op_branch_A0_bo_##BO_SUFFIX##_##CHAIN(void) \ +{ \ + do_execute_branch_bo<BO BO_VALUE, CHAIN>(A0, PARAM1); \ } +#define DEFINE_OP(BO_SUFFIX, BO_VALUE) \ +DEFINE_OP1(BO_SUFFIX, BO_VALUE, 0) \ +DEFINE_OP1(BO_SUFFIX, BO_VALUE, 1) DEFINE_OP(0000,(0,0,0,0)); DEFINE_OP(0001,(0,0,0,1)); diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp index 1088afc6..64930d8b 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp @@ -236,34 +236,58 @@ DEFINE_INSN(store, T0); #undef DEFINE_INSN -void powerpc_dyngen::gen_bc_A0(int bo, int bi, uint32 npc) +void powerpc_dyngen::gen_bc_A0(int bo, int bi, uint32 npc, bool direct_chaining) { #if 1 if (BO_CONDITIONAL_BRANCH(bo)) { gen_load_T0_CR(); gen_and_32_T0_im(1 << (31 - bi)); } + if (direct_chaining) { switch (bo >> 1) { #define _(A,B,C,D) (((A) << 3)| ((B) << 2) | ((C) << 1) | (D)) - case _(0,0,0,0): gen_op_branch_A0_bo_0000(npc); break; - case _(0,0,0,1): gen_op_branch_A0_bo_0001(npc); break; + case _(0,0,0,0): gen_op_branch_A0_bo_0000_1(npc); break; + case _(0,0,0,1): gen_op_branch_A0_bo_0001_1(npc); break; case _(0,0,1,0): - case _(0,0,1,1): gen_op_branch_A0_bo_001x(npc); break; - case _(0,1,0,0): gen_op_branch_A0_bo_0100(npc); break; - case _(0,1,0,1): gen_op_branch_A0_bo_0101(npc); break; + case _(0,0,1,1): gen_op_branch_A0_bo_001x_1(npc); break; + case _(0,1,0,0): gen_op_branch_A0_bo_0100_1(npc); break; + case _(0,1,0,1): gen_op_branch_A0_bo_0101_1(npc); break; case _(0,1,1,0): - case _(0,1,1,1): gen_op_branch_A0_bo_011x(npc); break; + case _(0,1,1,1): gen_op_branch_A0_bo_011x_1(npc); break;
case _(1,0,0,0): - case _(1,1,0,0): gen_op_branch_A0_bo_1x00(npc); break; + case _(1,1,0,0): gen_op_branch_A0_bo_1x00_1(npc); break; case _(1,0,0,1): - case _(1,1,0,1): gen_op_branch_A0_bo_1x01(npc); break; + case _(1,1,0,1): gen_op_branch_A0_bo_1x01_1(npc); break; case _(1,0,1,0): case _(1,0,1,1): case _(1,1,1,0): - case _(1,1,1,1): gen_op_branch_A0_bo_1x1x(); break; + case _(1,1,1,1): gen_op_branch_A0_bo_1x1x_1(); break; #undef _ default: abort(); } + } else { + switch (bo >> 1) { +#define _(A,B,C,D) (((A) << 3)| ((B) << 2) | ((C) << 1) | (D)) + case _(0,0,0,0): gen_op_branch_A0_bo_0000_0(npc); break; + case _(0,0,0,1): gen_op_branch_A0_bo_0001_0(npc); break; + case _(0,0,1,0): + case _(0,0,1,1): gen_op_branch_A0_bo_001x_0(npc); break; + case _(0,1,0,0): gen_op_branch_A0_bo_0100_0(npc); break; + case _(0,1,0,1): gen_op_branch_A0_bo_0101_0(npc); break; + case _(0,1,1,0): + case _(0,1,1,1): gen_op_branch_A0_bo_011x_0(npc); break; + case _(1,0,0,0): + case _(1,1,0,0): gen_op_branch_A0_bo_1x00_0(npc); break; + case _(1,0,0,1): + case _(1,1,0,1): gen_op_branch_A0_bo_1x01_0(npc); break; + case _(1,0,1,0): + case _(1,0,1,1): + case _(1,1,1,0): + case _(1,1,1,1): gen_op_branch_A0_bo_1x1x_0(); break; +#undef _ + default: abort(); + } + } #else if (BO_CONDITIONAL_BRANCH(bo)) { gen_load_T0_CR(); diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp index 333c7552..78593a0b 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp @@ -219,7 +219,7 @@ public: void gen_store_single_F0_A0_im(int32 offset); // Branch instructions - void gen_bc_A0(int bo, int bi, uint32 npc); + void gen_bc_A0(int bo, int bi, uint32 npc, bool direct_chaining); // Vector instructions void gen_load_ad_VD_VR(int i); diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp index 5ec54a29..3673664e 100644 --- 
a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp @@ -133,6 +133,9 @@ powerpc_cpu::compile_block(uint32 entry_point) bi->init(entry_point); bi->entry_point = dg.gen_start(); + // Direct block chaining support variables + bool use_direct_block_chaining = false; + uint32 dpc = entry_point - 4; uint32 min_pc, max_pc; min_pc = max_pc = entry_point; @@ -429,6 +432,7 @@ powerpc_cpu::compile_block(uint32 entry_point) break; } case PPC_I(BC): // Branch Conditional + { #if FOLLOW_CONST_JUMPS if (!LK_field::test(opcode)) { const int bo = BO_field::extract(opcode); @@ -440,8 +444,16 @@ powerpc_cpu::compile_block(uint32 entry_point) } } #endif - dg.gen_mov_32_A0_im(((AA_field::test(opcode) ? 0 : dpc) + operand_BD::get(this, opcode)) & -4); + const uint32 tpc = ((AA_field::test(opcode) ? 0 : dpc) + operand_BD::get(this, opcode)) & -4; +#if DYNGEN_DIRECT_BLOCK_CHAINING + // Use direct block chaining for in-page jumps or jumps to ROM area + const uint32 npc = dpc + 4; + if (((tpc & -4096) == (npc & -4096)) || is_read_only_memory(tpc)) + use_direct_block_chaining = true; +#endif + dg.gen_mov_32_A0_im(tpc); goto do_branch; + } case PPC_I(BCCTR): // Branch Conditional to Count Register dg.gen_load_A0_CTR(); goto do_branch; @@ -457,7 +469,7 @@ powerpc_cpu::compile_block(uint32 entry_point) if (LK_field::test(opcode)) dg.gen_store_im_LR(npc); - dg.gen_bc_A0(bo, bi, npc); + dg.gen_bc_A0(bo, bi, npc, use_direct_block_chaining); break; } case PPC_I(B): // Branch @@ -491,7 +503,7 @@ powerpc_cpu::compile_block(uint32 entry_point) dg.gen_mov_32_A0_im(tpc); // BO field is built so that we always branch to A0 - dg.gen_bc_A0(BO_MAKE(0,0,0,0), 0, 0); + dg.gen_bc_A0(BO_MAKE(0,0,0,0), 0, 0, false); break; } case PPC_I(CMP): // Compare @@ -1428,6 +1440,33 @@ powerpc_cpu::compile_block(uint32 entry_point) bi->size = dg.code_ptr() - bi->entry_point; if (disasm) disasm_translation(entry_point, dpc - entry_point + 4, bi->entry_point, 
bi->size); + +#if DYNGEN_DIRECT_BLOCK_CHAINING + // Generate backpatch trampolines + if (use_direct_block_chaining) { + typedef void *(*func_t)(dyngen_cpu_base); + func_t func = (func_t)nv_mem_fun(&powerpc_cpu::compile_chain_block).ptr(); + + // Taken PC + uint8 *p = dg.gen_start(); + dg.gen_mov_ad_T0_im(((uintptr)bi) | 0); + dg.gen_invoke_CPU_T0_ret_A0(func); + dg.gen_jmp_A0(); + dg.gen_end(); + bi->jmp_addr[0] = dg.jmp_addr[0]; + dg.set_jmp_target(dg.jmp_addr[0], p); + + // Not taken PC + p = dg.gen_start(); + dg.gen_mov_ad_T0_im(((uintptr)bi) | 1); + dg.gen_invoke_CPU_T0_ret_A0(func); + dg.gen_jmp_A0(); + dg.gen_end(); + bi->jmp_addr[1] = dg.jmp_addr[1]; + dg.set_jmp_target(dg.jmp_addr[1], p); + } +#endif + block_cache.add_to_cl_list(bi); if (is_read_only_memory(bi->pc)) block_cache.add_to_dormant_list(bi);