- Rewrite raw_init_cpu() to identify the CPU in more detail, following the Linux kernel sources.

- Add the possibility to tune code alignment to the underlying processor (a sketch
  of the idea follows the commit metadata below). However, this is turned off by
  default, as I don't see much improvement, and align_jumps = 64 for the Athlon
  looks suspicious to me.
- Remove two extra align_target() calls that are already covered.
- Remove unused may_trap() predicate.
gbeauche 2002-09-19 14:59:03 +00:00
parent feca66d43e
commit ecd3db832e
2 changed files with 228 additions and 101 deletions
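
A minimal standalone sketch of the alignment-tuning idea, assuming only the x86_alignments table, the tune_alignment flag and the align_loops/align_jumps defaults introduced in the diff below; the tune_for_processor() helper and the main() driver are illustrative and not part of the commit. Only the loop and jump alignment columns are kept, since those are the two fields raw_init_cpu() actually consumes.

#include <cstdio>

enum {
    X86_PROCESSOR_I386, X86_PROCESSOR_I486, X86_PROCESSOR_PENTIUM,
    X86_PROCESSOR_PENTIUMPRO, X86_PROCESSOR_K6, X86_PROCESSOR_ATHLON,
    X86_PROCESSOR_PENTIUM4, X86_PROCESSOR_max
};

struct alignment_info {
    int align_loop;  // byte alignment for the start of loops
    int align_jump;  // byte alignment for branch targets
};

// Loop/jump columns copied from the x86_alignments table in the diff.
static const alignment_info x86_alignments[X86_PROCESSOR_max] = {
    { 4, 4 },    // 80386
    { 16, 16 },  // 80486
    { 16, 16 },  // Pentium
    { 16, 16 },  // PentiumPro
    { 32, 32 },  // K6
    { 16, 64 },  // Athlon -- the 64-byte jump alignment questioned above
    { 0, 0 }     // Pentium4 (no preferred values in the table)
};

static bool tune_alignment = false;  // off, as explained in the commit message
static int align_loops = 32;         // defaults used when tuning is off
static int align_jumps = 32;

// Pick alignments for the detected processor, as raw_init_cpu() now does.
static void tune_for_processor(int x86_processor)
{
    if (tune_alignment && x86_processor < X86_PROCESSOR_max) {
        align_loops = x86_alignments[x86_processor].align_loop;
        align_jumps = x86_alignments[x86_processor].align_jump;
    }
}

int main()
{
    tune_for_processor(X86_PROCESSOR_ATHLON);
    std::printf("align_loops=%d align_jumps=%d\n", align_loops, align_jumps);
    return 0;
}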


@@ -2343,94 +2343,227 @@ static void vec(int x, struct sigcontext sc)
* Checking for CPU features *
*************************************************************************/
typedef struct {
uae_u32 eax;
uae_u32 ecx;
uae_u32 edx;
uae_u32 ebx;
} x86_regs;
struct cpuinfo_x86 {
uae_u8 x86; // CPU family
uae_u8 x86_vendor; // CPU vendor
uae_u8 x86_processor; // CPU canonical processor type
uae_u8 x86_brand_id; // CPU BrandID if supported, yield 0 otherwise
uae_u32 x86_hwcap;
uae_u8 x86_model;
uae_u8 x86_mask;
int cpuid_level; // Maximum supported CPUID level, -1=no CPUID
char x86_vendor_id[16];
};
struct cpuinfo_x86 cpuinfo;
enum {
X86_VENDOR_INTEL = 0,
X86_VENDOR_CYRIX = 1,
X86_VENDOR_AMD = 2,
X86_VENDOR_UMC = 3,
X86_VENDOR_NEXGEN = 4,
X86_VENDOR_CENTAUR = 5,
X86_VENDOR_RISE = 6,
X86_VENDOR_TRANSMETA = 7,
X86_VENDOR_NSC = 8,
X86_VENDOR_UNKNOWN = 0xff
};
/* This could be so much easier if it could make assumptions about the
compiler... */
enum {
X86_PROCESSOR_I386, /* 80386 */
X86_PROCESSOR_I486, /* 80486DX, 80486SX, 80486DX[24] */
X86_PROCESSOR_PENTIUM,
X86_PROCESSOR_PENTIUMPRO,
X86_PROCESSOR_K6,
X86_PROCESSOR_ATHLON,
X86_PROCESSOR_PENTIUM4,
X86_PROCESSOR_max
};
static uae_u8 cpuid_space[256];
static uae_u32 cpuid_ptr;
static uae_u32 cpuid_level;
static const char * x86_processor_string_table[X86_PROCESSOR_max] = {
"80386",
"80486",
"Pentium",
"PentiumPro",
"K6",
"Athlon",
"Pentium4"
};
static x86_regs cpuid(uae_u32 level)
static struct ptt {
const int align_loop;
const int align_loop_max_skip;
const int align_jump;
const int align_jump_max_skip;
const int align_func;
}
x86_alignments[X86_PROCESSOR_max] = {
{ 4, 3, 4, 3, 4 },
{ 16, 15, 16, 15, 16 },
{ 16, 7, 16, 7, 16 },
{ 16, 15, 16, 7, 16 },
{ 32, 7, 32, 7, 32 },
{ 16, 7, 64, 7, 16 },
{ 0, 0, 0, 0, 0 }
};
static void
x86_get_cpu_vendor(struct cpuinfo_x86 *c)
{
x86_regs answer;
uae_u8* tmp=get_target();
char *v = c->x86_vendor_id;
cpuid_ptr=(uae_u32)&answer;
cpuid_level=level;
set_target(cpuid_space);
raw_push_l_r(0); /* eax */
raw_push_l_r(1); /* ecx */
raw_push_l_r(2); /* edx */
raw_push_l_r(3); /* ebx */
raw_push_l_r(7); /* edi */
raw_mov_l_rm(0,(uae_u32)&cpuid_level);
raw_cpuid(0);
raw_mov_l_rm(7,(uae_u32)&cpuid_ptr);
raw_mov_l_Rr(7,0,0);
raw_mov_l_Rr(7,1,4);
raw_mov_l_Rr(7,2,8);
raw_mov_l_Rr(7,3,12);
raw_pop_l_r(7);
raw_pop_l_r(3);
raw_pop_l_r(2);
raw_pop_l_r(1);
raw_pop_l_r(0);
raw_ret();
set_target(tmp);
((cpuop_func*)cpuid_space)(0);
return answer;
if (!strcmp(v, "GenuineIntel"))
c->x86_vendor = X86_VENDOR_INTEL;
else if (!strcmp(v, "AuthenticAMD"))
c->x86_vendor = X86_VENDOR_AMD;
else if (!strcmp(v, "CyrixInstead"))
c->x86_vendor = X86_VENDOR_CYRIX;
else if (!strcmp(v, "Geode by NSC"))
c->x86_vendor = X86_VENDOR_NSC;
else if (!strcmp(v, "UMC UMC UMC "))
c->x86_vendor = X86_VENDOR_UMC;
else if (!strcmp(v, "CentaurHauls"))
c->x86_vendor = X86_VENDOR_CENTAUR;
else if (!strcmp(v, "NexGenDriven"))
c->x86_vendor = X86_VENDOR_NEXGEN;
else if (!strcmp(v, "RiseRiseRise"))
c->x86_vendor = X86_VENDOR_RISE;
else if (!strcmp(v, "GenuineTMx86") ||
!strcmp(v, "TransmetaCPU"))
c->x86_vendor = X86_VENDOR_TRANSMETA;
else
c->x86_vendor = X86_VENDOR_UNKNOWN;
}
static void raw_init_cpu(void)
static void
cpuid(uae_u32 op, uae_u32 *eax, uae_u32 *ebx, uae_u32 *ecx, uae_u32 *edx)
{
x86_regs x;
uae_u32 maxlev;
x=cpuid(0);
maxlev=x.eax;
write_log("Max CPUID level=%d Processor is %c%c%c%c%c%c%c%c%c%c%c%c\n",
maxlev,
x.ebx,
x.ebx>>8,
x.ebx>>16,
x.ebx>>24,
x.edx,
x.edx>>8,
x.edx>>16,
x.edx>>24,
x.ecx,
x.ecx>>8,
x.ecx>>16,
x.ecx>>24
);
have_rat_stall=(x.ecx==0x6c65746e);
static uae_u8 cpuid_space[256];
uae_u8* tmp=get_target();
if (maxlev>=1) {
x=cpuid(1);
if (x.edx&(1<<15))
have_cmov=1;
}
if (!have_cmov)
have_rat_stall=0;
#if 0 /* For testing of non-cmov code! */
have_cmov=0;
#endif
#if 1 /* It appears that partial register writes are a bad idea even on
set_target(cpuid_space);
raw_push_l_r(0); /* eax */
raw_push_l_r(1); /* ecx */
raw_push_l_r(2); /* edx */
raw_push_l_r(3); /* ebx */
raw_mov_l_rm(0,(uae_u32)&op);
raw_cpuid(0);
if (eax != NULL) raw_mov_l_mr((uae_u32)eax,0);
if (ebx != NULL) raw_mov_l_mr((uae_u32)ebx,3);
if (ecx != NULL) raw_mov_l_mr((uae_u32)ecx,1);
if (edx != NULL) raw_mov_l_mr((uae_u32)edx,2);
raw_pop_l_r(3);
raw_pop_l_r(2);
raw_pop_l_r(1);
raw_pop_l_r(0);
raw_ret();
set_target(tmp);
((cpuop_func*)cpuid_space)(0);
}
static void
raw_init_cpu(void)
{
struct cpuinfo_x86 *c = &cpuinfo;
/* Defaults */
c->x86_vendor = X86_VENDOR_UNKNOWN;
c->cpuid_level = -1; /* CPUID not detected */
c->x86_model = c->x86_mask = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_hwcap = 0;
/* Get vendor name */
c->x86_vendor_id[12] = '\0';
cpuid(0x00000000,
(uae_u32 *)&c->cpuid_level,
(uae_u32 *)&c->x86_vendor_id[0],
(uae_u32 *)&c->x86_vendor_id[8],
(uae_u32 *)&c->x86_vendor_id[4]);
x86_get_cpu_vendor(c);
/* Intel-defined flags: level 0x00000001 */
c->x86_brand_id = 0;
if ( c->cpuid_level >= 0x00000001 ) {
uae_u32 tfms, brand_id;
cpuid(0x00000001, &tfms, &brand_id, NULL, &c->x86_hwcap);
c->x86 = (tfms >> 8) & 15;
c->x86_model = (tfms >> 4) & 15;
c->x86_brand_id = brand_id & 0xff;
if ( (c->x86_vendor == X86_VENDOR_AMD) &&
(c->x86 == 0xf)) {
/* AMD Extended Family and Model Values */
c->x86 += (tfms >> 20) & 0xff;
c->x86_model += (tfms >> 12) & 0xf0;
}
c->x86_mask = tfms & 15;
} else {
/* Have CPUID level 0 only - unheard of */
c->x86 = 4;
}
/* Canonicalize processor ID */
c->x86_processor = X86_PROCESSOR_max;
switch (c->x86) {
case 3:
c->x86_processor = X86_PROCESSOR_I386;
break;
case 4:
c->x86_processor = X86_PROCESSOR_I486;
break;
case 5:
if (c->x86_vendor == X86_VENDOR_AMD)
c->x86_processor = X86_PROCESSOR_K6;
else
c->x86_processor = X86_PROCESSOR_PENTIUM;
break;
case 6:
if (c->x86_vendor == X86_VENDOR_AMD)
c->x86_processor = X86_PROCESSOR_ATHLON;
else
c->x86_processor = X86_PROCESSOR_PENTIUMPRO;
break;
case 15:
if (c->x86_vendor == X86_VENDOR_INTEL) {
/* Assume any BrandID >= 8 and family == 15 yields a Pentium 4 */
if (c->x86_brand_id >= 8)
c->x86_processor = X86_PROCESSOR_PENTIUM4;
}
break;
}
if (c->x86_processor == X86_PROCESSOR_max) {
fprintf(stderr, "Error: unknown processor type\n");
fprintf(stderr, " Family : %d\n", c->x86);
fprintf(stderr, " Model : %d\n", c->x86_model);
fprintf(stderr, " Mask : %d\n", c->x86_mask);
if (c->x86_brand_id)
fprintf(stderr, " BrandID : %02x\n", c->x86_brand_id);
abort();
}
/* Have CMOV support? */
have_cmov = (c->x86_hwcap & (1 << 15)) && true;
/* Can the host CPU suffer from partial register stalls? */
have_rat_stall = (c->x86_vendor == X86_VENDOR_INTEL);
#if 1
/* It appears that partial register writes are a bad idea even on
AMD K7 cores, even though they are not supposed to have the
dreaded rat stall. Why? Anyway, that's why we lie about it ;-) */
if (have_cmov)
have_rat_stall=1;
if (c->x86_processor == X86_PROCESSOR_ATHLON)
have_rat_stall = true;
#endif
/* Alignments */
if (tune_alignment) {
align_loops = x86_alignments[c->x86_processor].align_loop;
align_jumps = x86_alignments[c->x86_processor].align_jump;
}
write_log("Max CPUID level=%d Processor is %s [%s]\n",
c->cpuid_level, c->x86_vendor_id,
x86_processor_string_table[c->x86_processor]);
}
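
For reference, here is a standalone sketch of the vendor-string and family/model/stepping decoding that the new raw_init_cpu() above performs. It assumes a GCC-compatible compiler providing <cpuid.h>; the code in the diff cannot make such compiler assumptions, which is why it JIT-generates a small CPUID stub into cpuid_space instead. The second changed file of the commit follows.

#include <cpuid.h>
#include <cstdio>
#include <cstring>

int main()
{
    unsigned int eax, ebx, ecx, edx;

    // Leaf 0: maximum supported CPUID level plus the 12-byte vendor
    // string, returned in EBX, EDX, ECX (in that order).
    char vendor[13] = { 0 };
    if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
        return 1;
    std::memcpy(vendor + 0, &ebx, 4);
    std::memcpy(vendor + 4, &edx, 4);
    std::memcpy(vendor + 8, &ecx, 4);
    unsigned int cpuid_level = eax;

    // Leaf 1: EAX packs stepping (3:0), model (7:4) and family (11:8),
    // with extended model/family fields for family 0xf parts; EBX holds
    // the BrandID and EDX the feature flags (bit 15 = CMOV).
    if (cpuid_level >= 1 && __get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        unsigned int family   = (eax >> 8) & 15;
        unsigned int model    = (eax >> 4) & 15;
        unsigned int stepping = eax & 15;
        unsigned int brand_id = ebx & 0xff;
        if (family == 0xf) {       // the diff applies this for AMD only
            family += (eax >> 20) & 0xff;
            model  += (eax >> 12) & 0xf0;
        }
        bool have_cmov = (edx & (1 << 15)) != 0;
        std::printf("%s: family %u, model %u, stepping %u, brand %u, cmov %d\n",
                    vendor, family, model, stepping, brand_id, (int)have_cmov);
    }
    return 0;
}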


@@ -88,6 +88,9 @@ static bool lazy_flush = true; // Flag: lazy translation cache invalidation
static bool avoid_fpu = true; // Flag: compile FPU instructions ?
static bool have_cmov = false; // target has CMOV instructions ?
static bool have_rat_stall = true; // target has partial register stalls ?
static bool tune_alignment = false; // Tune code alignments for running CPU ?
static int align_loops = 32; // Align the start of loops
static int align_jumps = 32; // Align the start of jumps
static int zero_fd = -1;
static int optcount[10] = {
10, // How often a block has to be executed before it is translated
@@ -104,18 +107,11 @@ struct op_properties {
};
static op_properties prop[65536];
// gb-- Control Flow Predicates
static inline int end_block(uae_u32 opcode)
{
return (prop[opcode].cflow & fl_end_block);
}
static inline bool may_trap(uae_u32 opcode)
{
return (prop[opcode].cflow & fl_trap);
}
uae_u8* start_pc_p;
uae_u32 start_pc;
uae_u32 current_block_pc_p;
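
The may_trap() predicate removed above had no caller left, unlike end_block(), which decides where a translation block stops. Below is a minimal sketch of how such a predicate is typically consumed; the flag value, the opcode stream and the scan_block() driver are illustrative and not taken from this commit.

#include <cstdint>

typedef uint32_t uae_u32;

enum { fl_end_block = 1 };               // illustrative flag bit

struct op_properties { uint8_t cflow; }; // per-opcode control-flow properties
static op_properties prop[65536];

static inline int end_block(uae_u32 opcode)
{
    return (prop[opcode].cflow & fl_end_block);
}

// Hypothetical 68k opcode stream: two NOPs followed by an RTS.
static const uae_u32 stream[] = { 0x4e71, 0x4e71, 0x4e75 };

// Collect opcodes until one that terminates the block.
static int scan_block()
{
    prop[0x4e75].cflow |= fl_end_block;  // mark RTS as a block terminator
    int len = 0;
    for (unsigned i = 0; i < sizeof(stream) / sizeof(stream[0]); i++) {
        len++;
        if (end_block(stream[i]))
            break;
    }
    return len;                          // 3 for the stream above
}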
@@ -4562,6 +4558,7 @@ void compiler_init(void)
raw_init_cpu();
write_log("<JIT compiler> : target processor has CMOV instructions : %s\n", have_cmov ? "yes" : "no");
write_log("<JIT compiler> : target processor can suffer from partial register stalls : %s\n", have_rat_stall ? "yes" : "no");
write_log("<JIT compiler> : alignment for loops, jumps are %d, %d\n", align_loops, align_jumps);
// Translation cache flush mechanism
lazy_flush = PrefsFindBool("jitlazyflush");
@@ -5407,54 +5404,55 @@ static __inline__ void create_popalls(void)
registers before jumping back to the various get-out routines.
This generates the code for it.
*/
popall_do_nothing=current_compile_p;
align_target(align_jumps);
popall_do_nothing=get_target();
for (i=0;i<N_REGS;i++) {
if (need_to_preserve[i])
raw_pop_l_r(i);
}
raw_jmp((uae_u32)do_nothing);
align_target(32);
align_target(align_jumps);
popall_execute_normal=get_target();
for (i=0;i<N_REGS;i++) {
if (need_to_preserve[i])
raw_pop_l_r(i);
}
raw_jmp((uae_u32)execute_normal);
align_target(32);
align_target(align_jumps);
popall_cache_miss=get_target();
for (i=0;i<N_REGS;i++) {
if (need_to_preserve[i])
raw_pop_l_r(i);
}
raw_jmp((uae_u32)cache_miss);
align_target(32);
align_target(align_jumps);
popall_recompile_block=get_target();
for (i=0;i<N_REGS;i++) {
if (need_to_preserve[i])
raw_pop_l_r(i);
}
raw_jmp((uae_u32)recompile_block);
align_target(32);
align_target(align_jumps);
popall_exec_nostats=get_target();
for (i=0;i<N_REGS;i++) {
if (need_to_preserve[i])
raw_pop_l_r(i);
}
raw_jmp((uae_u32)exec_nostats);
align_target(32);
align_target(align_jumps);
popall_check_checksum=get_target();
for (i=0;i<N_REGS;i++) {
if (need_to_preserve[i])
raw_pop_l_r(i);
}
raw_jmp((uae_u32)check_checksum);
align_target(32);
align_target(align_jumps);
current_compile_p=get_target();
#else
popall_exec_nostats=(void *)exec_nostats;
@@ -5496,19 +5494,17 @@ static void prepare_block(blockinfo* bi)
int i;
set_target(current_compile_p);
align_target(32);
align_target(align_jumps);
bi->direct_pen=(cpuop_func *)get_target();
raw_mov_l_rm(0,(uae_u32)&(bi->pc_p));
raw_mov_l_mr((uae_u32)&regs.pc_p,0);
raw_jmp((uae_u32)popall_execute_normal);
align_target(32);
align_target(align_jumps);
bi->direct_pcc=(cpuop_func *)get_target();
raw_mov_l_rm(0,(uae_u32)&(bi->pc_p));
raw_mov_l_mr((uae_u32)&regs.pc_p,0);
raw_jmp((uae_u32)popall_check_checksum);
align_target(32);
current_compile_p=get_target();
bi->deplist=NULL;
@@ -5920,7 +5916,7 @@ static void compile_block(cpu_history* pc_hist, int blocklen)
bi->needed_flags=liveflags[0];
align_target(32);
align_target(align_loops);
was_comp=0;
bi->direct_handler=(cpuop_func *)get_target();
@@ -6095,7 +6091,7 @@ static void compile_block(cpu_history* pc_hist, int blocklen)
raw_jmp((uae_u32)popall_do_nothing);
create_jmpdep(bi,0,tba,t1);
align_target(16);
align_target(align_jumps);
/* not-predicted outcome */
*branchadd=(uae_u32)get_target()-((uae_u32)branchadd+4);
live=tmp; /* Ouch again */
@@ -6201,7 +6197,7 @@ static void compile_block(cpu_history* pc_hist, int blocklen)
#endif
log_dump();
align_target(32);
align_target(align_jumps);
/* This is the non-direct handler */
bi->handler=
@@ -6217,9 +6213,7 @@ static void compile_block(cpu_history* pc_hist, int blocklen)
raw_jmp((uae_u32)bi->direct_handler);
align_target(32);
current_compile_p=get_target();
raise_in_cl_list(bi);
/* We will flush soon, anyway, so let's do it now */
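
The hunks above replace the hard-coded align_target(16)/align_target(32) calls with align_target(align_loops) and align_target(align_jumps). Purely as an illustration of what an align_target()-style helper does, here is a sketch; the real helper in this code base may pad differently, and the emit_byte()/code_buffer scaffolding below is assumed for the example.

#include <cstdint>

static uint8_t code_buffer[4096];
static uint8_t *target = code_buffer;  // next free byte in the code cache

static void emit_byte(uint8_t b) { *target++ = b; }

// Pad with single-byte NOPs (0x90) until the emit pointer sits on a
// multiple of 'alignment', so the next block starts on that boundary.
// With this commit, callers pass align_loops or align_jumps here
// instead of a hard-coded 16 or 32.
static void align_target_sketch(int alignment)
{
    if (alignment <= 1)
        return;
    while ((uintptr_t)target % alignment != 0)
        emit_byte(0x90);               // x86 NOP
}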