direct block chaining, aka faster block dispatcher

This commit is contained in:
gbeauche 2004-05-11 20:53:25 +00:00
parent 6c0e2a9f2a
commit 08bcd2653d
14 changed files with 217 additions and 31 deletions

View File

@ -88,6 +88,10 @@
#define PPC_PROFILE_COMPILE_TIME 0
#define PPC_PROFILE_GENERIC_CALLS 0
#define KPX_MAX_CPUS 1
// Direct block chaining has only been tested on PowerPC hosts so far
#if defined(__powerpc__)
#define DYNGEN_DIRECT_BLOCK_CHAINING 1
#endif
#else
// Mac ROM is write protected
#define ROM_IS_WRITE_PROTECTED 1

View File

@ -273,13 +273,6 @@ DEFINE_OP(8,T0,1,T1);
dyngen_barrier(); \
} while (0)
#if defined(__powerpc__)
#define FAST_DISPATCH(TARGET) asm volatile ("b " #TARGET)
#endif
#if defined(__i386__) || defined(__x86_64__)
#define FAST_DISPATCH(TARGET) asm volatile ("jmp " #TARGET)
#endif
extern "C" void OPPROTO op_execute(uint8 *entry_point, basic_cpu *this_cpu);
void OPPROTO op_execute(uint8 *entry_point, basic_cpu *this_cpu)
{
@ -340,13 +333,18 @@ void OPPROTO op_jmp_slow(void)
void OPPROTO op_jmp_fast(void)
{
#ifdef FAST_DISPATCH
FAST_DISPATCH(__op_param1);
#ifdef DYNGEN_FAST_DISPATCH
DYNGEN_FAST_DISPATCH(__op_param1);
#else
SLOW_DISPATCH(PARAM1);
#endif
}
// Op template: dispatch to the address currently held in reg_A0,
// using the SLOW_DISPATCH path.
void OPPROTO op_jmp_A0(void)
{
SLOW_DISPATCH(reg_A0);
}
// Register calling conventions based arches don't need a stack frame
#if defined(__powerpc__) || defined(__x86_64__)
#define DEFINE_OP(NAME, CODE) \
@ -477,4 +475,16 @@ DEFINE_OP(op_invoke_direct_CPU_im_im, {
CALL(func(CPU, PARAM2, PARAM3));
});
// Op template: call the function whose address is held in reg_A0,
// passing it CPU and reg_T0, then store the returned pointer back
// into reg_A0 (the function address is overwritten by the result).
DEFINE_OP(op_invoke_CPU_T0_ret_A0, {
typedef void *(*func_t)(void *, uintptr);
func_t func = (func_t)reg_A0;
reg_A0 = (uintptr)CALL(func(CPU, reg_T0));
});
// Op template: same as op_invoke_CPU_T0_ret_A0, except that the
// function address comes from the op parameter PARAM1 instead of
// reg_A0. The returned pointer is stored into reg_A0.
DEFINE_OP(op_invoke_direct_CPU_T0_ret_A0, {
typedef void *(*func_t)(void *, uintptr);
func_t func = (func_t)PARAM1;
reg_A0 = (uintptr)CALL(func(CPU, reg_T0));
});
#undef DEFINE_OP

View File

@ -135,3 +135,15 @@ basic_dyngen::gen_invoke_CPU_im_im(void (*func)(dyngen_cpu_base, uint32, uint32)
gen_op_invoke_CPU_im_im(param1, param2);
}
}
void
basic_dyngen::gen_invoke_CPU_T0_ret_A0(void *(*func)(dyngen_cpu_base))
{
if (direct_call_possible((uintptr)func))
gen_op_invoke_direct_CPU_T0_ret_A0((uintptr)func);
else {
gen_op_mov_ad_A0_im((uintptr)func);
gen_op_invoke_CPU_T0_ret_A0();
}
}

View File

@ -84,6 +84,7 @@ public:
void gen_invoke_CPU_T0(void (*func)(dyngen_cpu_base, uint32));
void gen_invoke_CPU_im(void (*func)(dyngen_cpu_base, uint32), uint32 value);
void gen_invoke_CPU_im_im(void (*func)(dyngen_cpu_base, uint32, uint32), uint32 param1, uint32 param2);
void gen_invoke_CPU_T0_ret_A0(void *(*func)(dyngen_cpu_base));
// Raw aliases
#define DEFINE_ALIAS_RAW(NAME, ARGLIST, ARGS) \
@ -176,6 +177,7 @@ public:
// Jump instructions
DEFINE_ALIAS(jmp_slow,1);
DEFINE_ALIAS(jmp_fast,1);
DEFINE_ALIAS(jmp_A0,0);
// Load/Store instructions
DEFINE_ALIAS(load_u32_T0_A0_T1,0);
@ -203,8 +205,37 @@ public:
#undef DEFINE_ALIAS_2
#undef DEFINE_ALIAS_3
#undef DEFINE_ALIAS_RAW
#if DYNGEN_DIRECT_BLOCK_CHAINING
// Jump addresses for direct chaining
uint8 *jmp_addr[2];
// Set jump target address
void set_jmp_target(uint8 *jmp_addr, uint8 *addr);
#endif
};
#if DYNGEN_DIRECT_BLOCK_CHAINING
// Retarget the branch instruction located at JMP_ADDR so that it
// transfers control to ADDR. Only implemented for PowerPC hosts: on
// any other host the function body is empty and nothing is patched.
// NOTE(review): the JMP_ADDR parameter shadows the jmp_addr[] member
// of the class inside this function.
inline void
basic_dyngen::set_jmp_target(uint8 *jmp_addr, uint8 *addr)
{
#if defined(__powerpc__)
// patch the branch destination
// Only the bits selected by 0x03fffffc are replaced, with the byte
// distance from the branch itself to ADDR; every other bit of the
// instruction word is preserved.
// NOTE(review): assumes the word at JMP_ADDR is a relative branch whose
// displacement field is exactly those bits -- TODO confirm. The distance
// is masked rather than range-checked, so a target that does not fit in
// the field would silently yield a wrong destination.
uint32 *ptr = (uint32 *)jmp_addr;
uint32 val = *ptr;
val = (val & ~0x03fffffc) | ((addr - jmp_addr) & 0x03fffffc);
*ptr = val;
// flush icache
// Order matters: write the patched word back from the data cache
// (dcbst + sync) before invalidating the matching instruction cache
// block (icbi + sync), then isync so the new branch gets fetched.
asm volatile ("dcbst 0,%0" : : "r"(ptr) : "memory");
asm volatile ("sync" : : : "memory");
asm volatile ("icbi 0,%0" : : "r"(ptr) : "memory");
asm volatile ("sync" : : : "memory");
asm volatile ("isync" : : : "memory");
#endif
}
#endif
inline bool
basic_dyngen::direct_jump_possible(uintptr target) const
{

View File

@ -96,6 +96,14 @@ extern int __op_cpuparam;
#define CPUPARAM ((long)(&__op_cpuparam))
#endif
// Direct block chaining support
#if defined(__powerpc__)
#define DYNGEN_FAST_DISPATCH(TARGET) asm volatile ("b " #TARGET)
#endif
#if defined(__i386__) || defined(__x86_64__)
#define DYNGEN_FAST_DISPATCH(TARGET) asm volatile ("jmp " #TARGET)
#endif
extern int __op_jmp0, __op_jmp1;
#endif /* DYNGEN_EXEC_H */

View File

@ -867,7 +867,7 @@ void gen_code(const char *name, const char *demangled_name,
runtime to do translated block
chaining: the offset of the instruction
needs to be stored */
fprintf(outfile, " jmp_offsets[%d] = %d + (code_ptr() - gen_code_buf);\n",
fprintf(outfile, " jmp_addr[%d] = code_ptr() + %d;\n",
n, rel->r_offset - start_offset);
continue;
}

View File

@ -44,6 +44,17 @@
#define DYNGEN_ASM_OPTS 0
#endif
/**
* DYNGEN_DIRECT_BLOCK_CHAINING
*
* Define to 1 to enable direct block chaining on platforms that
* support this feature, e.g. PowerPC. Defaults to 0 (disabled).
**/
#ifndef DYNGEN_DIRECT_BLOCK_CHAINING
#define DYNGEN_DIRECT_BLOCK_CHAINING 0
#endif
/**
* Helpers to reach JIT backends headers
**/

View File

@ -21,6 +21,7 @@
#ifndef PPC_BLOCKINFO_H
#define PPC_BLOCKINFO_H
#include "cpu/jit/jit-config.hpp"
#include "nvmemfun.hpp"
#include "basic-blockinfo.hpp"
@ -42,6 +43,9 @@ struct powerpc_block_info
#endif
#if PPC_ENABLE_JIT
uint8 * entry_point;
#if DYNGEN_DIRECT_BLOCK_CHAINING
uint8 * jmp_addr[2]; // Jump addresses for direct chaining
#endif
#endif
uintptr min_pc, max_pc;

View File

@ -431,6 +431,26 @@ bool powerpc_cpu::check_spcflags()
return true;
}
#if DYNGEN_DIRECT_BLOCK_CHAINING
// Resolve one chained exit of a translated block.
//
// SBI is the source block pointer with the jump slot index (0 or 1)
// stored in its two low bits. The block for the current pc() is looked
// up in the block cache, and compiled first if it is not there yet.
// The jump recorded in slot N of the source block is then patched to
// branch straight to that block.
//
// Returns the entry point of the target block's translated code.
void *powerpc_cpu::compile_chain_block(block_info *sbi)
{
	// Block index is stuffed into the source basic block pointer,
	// which is aligned at least on 4-byte boundaries
	const int n = ((uintptr)sbi) & 3;
	sbi = (block_info *)(((uintptr)sbi) & ~3L);

	// Find the block to chain to, translating it on demand
	// NOTE(review): this assumes compile_block() leaves SBI and its
	// recorded jump addresses valid -- confirm against the block cache
	const uint32 tpc = pc();
	block_info *tbi = block_cache.find(tpc);
	if (tbi == NULL)
		tbi = compile_block(tpc);
	assert(tbi && tbi->pc == tpc);

	// Patch the source block so that later executions skip this lookup
	codegen.set_jmp_target(sbi->jmp_addr[n], tbi->entry_point);
	return tbi->entry_point;
}
#endif
void powerpc_cpu::execute(uint32 entry)
{
pc() = entry;
@ -644,6 +664,13 @@ void powerpc_cpu::invalidate_cache_range(uintptr start, uintptr end)
{
// Drop cached blocks covering [start, end). The debug message below
// reports the range as passed in, before any widening.
D(bug("Invalidate cache block [%08x - %08x]\n", start, end));
#if PPC_DECODE_CACHE || PPC_ENABLE_JIT
#if DYNGEN_DIRECT_BLOCK_CHAINING
if (use_jit) {
// Invalidate on page boundaries
// START is rounded down and END rounded up to a multiple of 4096.
// NOTE(review): presumably needed because chained blocks may branch
// directly to other blocks in the same page -- confirm
start &= -4096;
end = (end + 4095) & -4096;
}
#endif
block_cache.clear_range(start, end);
#endif
}

View File

@ -337,6 +337,9 @@ private:
friend class powerpc_dyngen;
powerpc_dyngen codegen;
block_info *compile_block(uint32 entry);
#if DYNGEN_DIRECT_BLOCK_CHAINING
void *compile_chain_block(block_info *sbi);
#endif
#endif
// Semantic action templates

View File

@ -684,7 +684,7 @@ DEFINE_OP(branch_if_not_T0);
#undef DEFINE_OP
#undef DEFINE_OP_CTR
template< int bo >
template< int bo, bool chain >
static inline void do_execute_branch_bo(uint32 tpc, uint32 npc)
{
bool ctr_ok = true;
@ -706,20 +706,33 @@ static inline void do_execute_branch_bo(uint32 tpc, uint32 npc)
powerpc_dyngen_helper::set_ctr(ctr);
}
if (ctr_ok && cond_ok)
if (ctr_ok && cond_ok) {
powerpc_dyngen_helper::set_pc(tpc);
else
#ifdef DYNGEN_FAST_DISPATCH
if (chain && powerpc_dyngen_helper::spcflags().empty())
DYNGEN_FAST_DISPATCH(__op_jmp0);
#endif
}
else {
powerpc_dyngen_helper::set_pc(npc);
#ifdef DYNGEN_FAST_DISPATCH
if (chain && powerpc_dyngen_helper::spcflags().empty())
DYNGEN_FAST_DISPATCH(__op_jmp1);
#endif
}
dyngen_barrier();
}
#define BO(A,B,C,D) (((A) << 4)| ((B) << 3) | ((C) << 2) | ((D) << 1))
#define DEFINE_OP(BO_SUFFIX, BO_VALUE) \
void OPPROTO op_branch_A0_bo_##BO_SUFFIX(void) \
{ \
do_execute_branch_bo<BO BO_VALUE>(A0, PARAM1); \
#define DEFINE_OP1(BO_SUFFIX, BO_VALUE, CHAIN) \
void OPPROTO op_branch_A0_bo_##BO_SUFFIX##_##CHAIN(void) \
{ \
do_execute_branch_bo<BO BO_VALUE, CHAIN>(A0, PARAM1); \
}
#define DEFINE_OP(BO_SUFFIX, BO_VALUE) \
DEFINE_OP1(BO_SUFFIX, BO_VALUE, 0) \
DEFINE_OP1(BO_SUFFIX, BO_VALUE, 1)
DEFINE_OP(0000,(0,0,0,0));
DEFINE_OP(0001,(0,0,0,1));

View File

@ -236,34 +236,58 @@ DEFINE_INSN(store, T0);
#undef DEFINE_INSN
void powerpc_dyngen::gen_bc_A0(int bo, int bi, uint32 npc)
void powerpc_dyngen::gen_bc_A0(int bo, int bi, uint32 npc, bool direct_chaining)
{
#if 1
if (BO_CONDITIONAL_BRANCH(bo)) {
gen_load_T0_CR();
gen_and_32_T0_im(1 << (31 - bi));
}
if (direct_chaining) {
switch (bo >> 1) {
#define _(A,B,C,D) (((A) << 3)| ((B) << 2) | ((C) << 1) | (D))
case _(0,0,0,0): gen_op_branch_A0_bo_0000(npc); break;
case _(0,0,0,1): gen_op_branch_A0_bo_0001(npc); break;
case _(0,0,0,0): gen_op_branch_A0_bo_0000_1(npc); break;
case _(0,0,0,1): gen_op_branch_A0_bo_0001_1(npc); break;
case _(0,0,1,0):
case _(0,0,1,1): gen_op_branch_A0_bo_001x(npc); break;
case _(0,1,0,0): gen_op_branch_A0_bo_0100(npc); break;
case _(0,1,0,1): gen_op_branch_A0_bo_0101(npc); break;
case _(0,0,1,1): gen_op_branch_A0_bo_001x_1(npc); break;
case _(0,1,0,0): gen_op_branch_A0_bo_0100_1(npc); break;
case _(0,1,0,1): gen_op_branch_A0_bo_0101_1(npc); break;
case _(0,1,1,0):
case _(0,1,1,1): gen_op_branch_A0_bo_011x(npc); break;
case _(0,1,1,1): gen_op_branch_A0_bo_011x_1(npc); break;
case _(1,0,0,0):
case _(1,1,0,0): gen_op_branch_A0_bo_1x00(npc); break;
case _(1,1,0,0): gen_op_branch_A0_bo_1x00_1(npc); break;
case _(1,0,0,1):
case _(1,1,0,1): gen_op_branch_A0_bo_1x01(npc); break;
case _(1,1,0,1): gen_op_branch_A0_bo_1x01_1(npc); break;
case _(1,0,1,0):
case _(1,0,1,1):
case _(1,1,1,0):
case _(1,1,1,1): gen_op_branch_A0_bo_1x1x(); break;
case _(1,1,1,1): gen_op_branch_A0_bo_1x1x_1(); break;
#undef _
default: abort();
}
} else {
switch (bo >> 1) {
#define _(A,B,C,D) (((A) << 3)| ((B) << 2) | ((C) << 1) | (D))
case _(0,0,0,0): gen_op_branch_A0_bo_0000_0(npc); break;
case _(0,0,0,1): gen_op_branch_A0_bo_0001_0(npc); break;
case _(0,0,1,0):
case _(0,0,1,1): gen_op_branch_A0_bo_001x_0(npc); break;
case _(0,1,0,0): gen_op_branch_A0_bo_0100_0(npc); break;
case _(0,1,0,1): gen_op_branch_A0_bo_0101_0(npc); break;
case _(0,1,1,0):
case _(0,1,1,1): gen_op_branch_A0_bo_011x_0(npc); break;
case _(1,0,0,0):
case _(1,1,0,0): gen_op_branch_A0_bo_1x00_0(npc); break;
case _(1,0,0,1):
case _(1,1,0,1): gen_op_branch_A0_bo_1x01_0(npc); break;
case _(1,0,1,0):
case _(1,0,1,1):
case _(1,1,1,0):
case _(1,1,1,1): gen_op_branch_A0_bo_1x1x_0(); break;
#undef _
default: abort();
}
}
#else
if (BO_CONDITIONAL_BRANCH(bo)) {
gen_load_T0_CR();

View File

@ -219,7 +219,7 @@ public:
void gen_store_single_F0_A0_im(int32 offset);
// Branch instructions
void gen_bc_A0(int bo, int bi, uint32 npc);
void gen_bc_A0(int bo, int bi, uint32 npc, bool direct_chaining);
// Vector instructions
void gen_load_ad_VD_VR(int i);

View File

@ -133,6 +133,9 @@ powerpc_cpu::compile_block(uint32 entry_point)
bi->init(entry_point);
bi->entry_point = dg.gen_start();
// Direct block chaining support variables
bool use_direct_block_chaining = false;
uint32 dpc = entry_point - 4;
uint32 min_pc, max_pc;
min_pc = max_pc = entry_point;
@ -429,6 +432,7 @@ powerpc_cpu::compile_block(uint32 entry_point)
break;
}
case PPC_I(BC): // Branch Conditional
{
#if FOLLOW_CONST_JUMPS
if (!LK_field::test(opcode)) {
const int bo = BO_field::extract(opcode);
@ -440,8 +444,16 @@ powerpc_cpu::compile_block(uint32 entry_point)
}
}
#endif
dg.gen_mov_32_A0_im(((AA_field::test(opcode) ? 0 : dpc) + operand_BD::get(this, opcode)) & -4);
const uint32 tpc = ((AA_field::test(opcode) ? 0 : dpc) + operand_BD::get(this, opcode)) & -4;
#if DYNGEN_DIRECT_BLOCK_CHAINING
// Use direct block chaining for in-page jumps or jumps to ROM area
const uint32 npc = dpc + 4;
if (((tpc & -4096) == (npc & -4096)) || is_read_only_memory(tpc))
use_direct_block_chaining = true;
#endif
dg.gen_mov_32_A0_im(tpc);
goto do_branch;
}
case PPC_I(BCCTR): // Branch Conditional to Count Register
dg.gen_load_A0_CTR();
goto do_branch;
@ -457,7 +469,7 @@ powerpc_cpu::compile_block(uint32 entry_point)
if (LK_field::test(opcode))
dg.gen_store_im_LR(npc);
dg.gen_bc_A0(bo, bi, npc);
dg.gen_bc_A0(bo, bi, npc, use_direct_block_chaining);
break;
}
case PPC_I(B): // Branch
@ -491,7 +503,7 @@ powerpc_cpu::compile_block(uint32 entry_point)
dg.gen_mov_32_A0_im(tpc);
// BO field is built so that we always branch to A0
dg.gen_bc_A0(BO_MAKE(0,0,0,0), 0, 0);
dg.gen_bc_A0(BO_MAKE(0,0,0,0), 0, 0, false);
break;
}
case PPC_I(CMP): // Compare
@ -1428,6 +1440,33 @@ powerpc_cpu::compile_block(uint32 entry_point)
bi->size = dg.code_ptr() - bi->entry_point;
if (disasm)
disasm_translation(entry_point, dpc - entry_point + 4, bi->entry_point, bi->size);
#if DYNGEN_DIRECT_BLOCK_CHAINING
// Generate backpatch trampolines
if (use_direct_block_chaining) {
typedef void *(*func_t)(dyngen_cpu_base);
func_t func = (func_t)nv_mem_fun(&powerpc_cpu::compile_chain_block).ptr();
// Taken PC
uint8 *p = dg.gen_start();
dg.gen_mov_ad_T0_im(((uintptr)bi) | 0);
dg.gen_invoke_CPU_T0_ret_A0(func);
dg.gen_jmp_A0();
dg.gen_end();
bi->jmp_addr[0] = dg.jmp_addr[0];
dg.set_jmp_target(dg.jmp_addr[0], p);
// Not taken PC
p = dg.gen_start();
dg.gen_mov_ad_T0_im(((uintptr)bi) | 1);
dg.gen_invoke_CPU_T0_ret_A0(func);
dg.gen_jmp_A0();
dg.gen_end();
bi->jmp_addr[1] = dg.jmp_addr[1];
dg.set_jmp_target(dg.jmp_addr[1], p);
}
#endif
block_cache.add_to_cl_list(bi);
if (is_read_only_memory(bi->pc))
block_cache.add_to_dormant_list(bi);