WIP: changes to cpu throttling/timing

WARN: this is still a WIP and untested.  Instead of calling nanosleep(), the CPU
thread now uses an adaptive spin loop.  This better emulates the actual Apple //e
speed in preparation for near-realtime audio.  The drawback is that the CPU thread
is pegged at 100% usage.
This commit is contained in:
Aaron Culliney 2013-07-21 17:20:03 -07:00
parent d599a2174a
commit 96d083a2c3
5 changed files with 154 additions and 85 deletions

View File

@ -32,8 +32,9 @@
#define DebugCurrEA SN(cpu65_debug)
#define DebugCurrByte SN(cpu65_debug)+2
#define DebugCurrOp SN(cpu65_debug)+3
#define XCyclesCount SN(cpu65_debug)+4
/* Byte offsets into cpu65_debug.  These must mirror the field order of
   struct cpu65_extra: ea (0-1), d (2), rw (3), opcode (4), opcycles (5).
   The opcode and cycle-count offsets were previously swapped relative to
   the struct, so C code reading .opcode/.opcycles saw each other's value. */
#define DebugCurrRW SN(cpu65_debug)+3
#define DebugCurrOpcode SN(cpu65_debug)+4
#define DebugCycleCount SN(cpu65_debug)+5
/* -------------------------------------------------------------------------
CPU (6502) Helper Routines
@ -57,7 +58,7 @@
(,EffectiveAddr_E,8); \
#define GetFromEA_B \
orb $1, DebugCurrOp; \
orb $1, DebugCurrRW; \
call *SN(cpu65_vmem) \
(,EffectiveAddr_E,8);
@ -71,7 +72,7 @@
(,EffectiveAddr_E,8);
#define PutToEA_B \
orb $2, DebugCurrOp; \
orb $2, DebugCurrRW; \
orb %al, DebugCurrByte; \
call *SN(cpu65_vmem)+4 \
(,EffectiveAddr_E,8);
@ -90,10 +91,6 @@
call *SN(cpu65_vmem) \
(,EffectiveAddr_E,8); \
// reset operation code before each instruction
#define ZeroOp movb $0, DebugCurrOp; \
movb $0, XCyclesCount;
// NOTE: the orb functions as a move, but we want to
// set the flags and we know %ah is zero
#define Continue \
@ -139,12 +136,12 @@
9:
#define BranchXCycles \
incb XCyclesCount; /* +1 cycle branch taken */ \
incb DebugCycleCount; /* +1 branch taken */ \
pushl %ebx; \
movw PC_Reg, %bx; \
addb %al, %bl; \
jnc 9f; \
incb XCyclesCount; /* +1 cycle branch across pg boundary */ \
incb DebugCycleCount; /* +1 branch across pg boundary */ \
9: addw %ax, PC_Reg; \
popl %ebx;
@ -234,7 +231,7 @@
addb X_Reg, %al; \
jnc 9f; \
adcb $0, %ah; \
incb XCyclesCount; /* +1 cycle on page boundary */ \
incb DebugCycleCount; /* +1 cycle on page boundary */ \
9: movl %eax, EffectiveAddr_E;
#define GetAbs_Y \
@ -242,7 +239,7 @@
addb Y_Reg, %al; \
jnc 9f; \
adcb $0, %ah; \
incb XCyclesCount; /* +1 cycle on page boundary */ \
incb DebugCycleCount; /* +1 cycle on page boundary */ \
9: movl %eax, EffectiveAddr_E;
/* Absolute Indirect Addressing - The second and third bytes of the
@ -309,7 +306,7 @@
addb Y_Reg, %al; \
jnc 9f; \
adcb $0, %ah; \
incb XCyclesCount; /* +1 cycle on page boundary */ \
incb DebugCycleCount; /* +1 cycle on page boundary */ \
9: movl %eax, EffectiveAddr_E;
#define DoADC_b GetFromEA_B \
@ -464,7 +461,7 @@
// Decimal mode
op_ADC_dec:
incb XCyclesCount // +1 cycle
incb DebugCycleCount // +1 cycle
DoADC_d
Continue
@ -1590,7 +1587,7 @@ op_RTS:
---------------------------------- */
op_SBC_dec:
incb XCyclesCount // +1 cycle
incb DebugCycleCount // +1 cycle
DoSBC_d
Continue
@ -1727,7 +1724,7 @@ op_STA_imm:
op_STA_zpage:
GetZPage
DoSTA
incb XCyclesCount // +1 cycle on write
incb DebugCycleCount // +1 cycle on write
Continue
op_STA_zpage_x:
@ -1747,13 +1744,13 @@ op_STA_abs:
op_STA_abs_x:
GetAbs_X
DoSTA
incb XCyclesCount // +1 cycle on write
incb DebugCycleCount // +1 cycle on write
Continue
op_STA_abs_y:
GetAbs_Y
DoSTA
incb XCyclesCount // +1 cycle on write
incb DebugCycleCount // +1 cycle on write
Continue
op_STA_ind_x:
@ -1764,7 +1761,7 @@ op_STA_ind_x:
op_STA_ind_y:
GetIndZPage_Y
DoSTA
incb XCyclesCount // +1 cycle on write
incb DebugCycleCount // +1 cycle on write
Continue
// 65c02 : 0x92
@ -1817,7 +1814,7 @@ op_RMB7_65c02:
op_STX_zpage:
GetZPage
DoSTX
incb XCyclesCount // +1 cycle on write
incb DebugCycleCount // +1 cycle on write
Continue
// HACK : is this used? need to study coverage ...
@ -1838,7 +1835,7 @@ op_STX_abs:
op_STY_zpage:
GetZPage
DoSTY
incb XCyclesCount // +1 cycle on write
incb DebugCycleCount // +1 cycle on write
Continue
op_STY_zpage_x:
@ -1860,7 +1857,7 @@ op_STY_abs:
op_STZ_zpage:
GetZPage
DoSTZ
incb XCyclesCount // +1 cycle on write
incb DebugCycleCount // +1 cycle on write
Continue
// 65c02 : 0x74
@ -1878,7 +1875,7 @@ op_STZ_abs:
op_STZ_abs_x:
GetAbs_X
DoSTZ
incb XCyclesCount // +1 cycle on write
incb DebugCycleCount // +1 cycle on write
Continue
/* ----------------------------------
@ -2664,8 +2661,10 @@ continue: SaveState
call SN(timing_throttle)
ReplaceState
xorb %ah, %ah
ZeroOp
// reset the per-instruction debug/timing bookkeeping before the next fetch
movb $0, DebugCurrRW
movb $0, DebugCycleCount
GetFromPC_B
// record the opcode byte for timing_throttle(); the dispatch below indexes
// with %eax, so the fetched byte is taken from %al (register operand, not
// the immediate/symbol form "$al")
movb %al, DebugCurrOpcode
jmp *cpu65__opcodes(,%eax,4)
/* Exception handler */

View File

@ -43,8 +43,9 @@ struct cpu65_extra
{
uint16_t ea; /* Last effective address */
uint8_t d; /* Last data byte written */
uint8_t op; /* 1 = read occured, 2 = write, 3 = both */
uint8_t xcycles; /* Last opcode extra cycles */
uint8_t rw; /* 1 = read occured, 2 = write, 3 = both */
uint8_t opcode; /* Last opcode */
uint8_t opcycles; /* Last opcode extra cycles */
};
/* 6502 CPU models */

View File

@ -732,17 +732,19 @@ void c_read_random() {
/*
 * Body of the CPU thread.  Loops forever: logs, enters cpu65_run(), and
 * calls reinitialize() each time cpu65_run() returns.  The argument is
 * not used.
 */
static void cpu_thread(void *dummyptr) {
    for (;;)
    {
        LOG("cpu_thread : entering cpu65_run()...");
        cpu65_run();
        reinitialize();
    }
}
static void main_thread(void *dummyptr) {
struct timespec abstime = { .tv_sec=0, .tv_nsec=8333333 }; // 120Hz
do
{
// sleep waiting for the cpu thread to ping us that it's sleeping...
// sleep waiting for the cpu thread to ping us to render
pthread_mutex_lock(&mutex);
pthread_cond_wait(&cond, &mutex);
pthread_cond_timedwait(&cond, &mutex, &abstime);
pthread_mutex_unlock(&mutex);
c_periodic_update(0);

View File

@ -17,16 +17,20 @@
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <limits.h>
#define DEFAULT_SLEEP 120
#define CALIBRATE_HZ 120
static unsigned int sleep_hz = DEFAULT_SLEEP; // sleep intervals per sec
static unsigned long cpu_target_hz = APPLE2_HZ; // target clock speed
static unsigned long cycles_interval = APPLE2_HZ / DEFAULT_SLEEP; // Number of 65c02 instructions to be executed at sleep_hz
static unsigned long processing_interval = NANOSECONDS / DEFAULT_SLEEP; // Number of nanoseconds in sleep_hz intervals
static unsigned long calibrate_interval = NANOSECONDS / CALIBRATE_HZ; // calibration interval for drifting
static unsigned long cycle_nanoseconds = NANOSECONDS / APPLE2_HZ; // nanosecs per cycle
static unsigned int cycle_nanoseconds_count;
static struct timespec deltat, t0, ti, tj;
static unsigned long cycle=0;
static unsigned long cycle_count=0; // CPU cycle counter
static int spinloop_count=0; // spin loop counter
static long sleep_adjust=0;
static long sleep_adjust_inc=0;
@ -58,73 +62,137 @@ static inline long timespec_nsecs(struct timespec t) {
return t.tv_sec*NANOSECONDS + t.tv_nsec;
}
/*
 * Busy-wait helper used to throttle to the target CPU Hz.
 *
 * Performs `c` increments of a function-local volatile counter and returns.
 * The counter's value is never consumed; it is volatile only so that the
 * compiler cannot optimize the loop away.
 */
static inline void _spin_loop(unsigned long c)
{
    static volatile unsigned int burn = 0;
    unsigned long remaining = c;

    while (remaining != 0)
    {
        ++burn;
        --remaining;
    }
}
static void _determine_initial_spinloop_counter()
{
struct timespec s0, s1;
// time the spinloop to determine a good starting value for the spin counter
unsigned long avg_spin_nsecs = 0;
unsigned int const samples = 5;
unsigned int i=0;
spinloop_count = 500000000;
do
{
clock_gettime(CLOCK_MONOTONIC, &s0);
_spin_loop(spinloop_count);
clock_gettime(CLOCK_MONOTONIC, &s1);
deltat = timespec_diff(s0, s1);
if (deltat.tv_sec > 0)
{
printf("oops long wait (>= %lu sec) adjusting loop count (%d -> %d)\n", deltat.tv_sec, spinloop_count, spinloop_count>>1);
spinloop_count >>= 1;
i = 0;
avg_spin_nsecs = 0;
continue;
}
printf("spinloop = %lu nsec\n", deltat.tv_nsec);
avg_spin_nsecs += deltat.tv_nsec;
++i;
} while (i<samples);
avg_spin_nsecs = (avg_spin_nsecs / samples);
printf("average = %lu nsec\n", avg_spin_nsecs);
spinloop_count = cycle_nanoseconds * spinloop_count / avg_spin_nsecs;
cycle_nanoseconds_count = cycle_nanoseconds / spinloop_count;
printf("counter for a single cycle = %d\n", spinloop_count);
}
/*
 * One-time timing setup, intended to be run at startup only.
 *
 * Calibrates the spin loop first, and afterwards stores the current
 * CLOCK_MONOTONIC reading in t0 and copies it to ti.
 */
void timing_initialize()
{
    _determine_initial_spinloop_counter();

    clock_gettime(CLOCK_MONOTONIC, &t0);
    ti = t0;
}
void timing_set_cpu_target_hz(unsigned long hz) {
cpu_target_hz = hz;
}
void timing_set_sleep_hz(unsigned int hz) {
sleep_hz = hz;
void timing_set_cpu_scale(unsigned int scale)
{
// ...
}
/*
* Throttles the 65c02 CPU down to a target frequency of X.
* Currently set to target the Apple //e @ 1.02MHz
* Throttles 6502 CPU down to the target CPU frequency (default is speed of original Apple //e).
*
* This uses an adaptive spin loop to stay closer to the target CPU frequency.
*
* This is called from cpu65_run() on the cpu-thread
*/
void timing_throttle() {
++cycle;
void timing_throttle()
{
static unsigned int drift_interval_counter=0; // in nsecs since last
static unsigned int instruction_interval_counter=0; // instruction count since last
static unsigned int spin_adjust_interval=INT_MAX;
static int8_t spin_adjust_count=0; // +/- 1
static time_t severe_lag=0;
++instruction_interval_counter;
if ((cycle%cycles_interval) == 0)
unsigned int opcycles = cpu65__opcycles[cpu65_debug.opcode] + cpu65_debug.opcycles;
if (!opcycles)
{
opcycles = 2; // assume 2 cycles for UNK opcodes
}
cycle_count += opcycles;
// wake render thread as we go to sleep
pthread_mutex_lock(&mutex);
pthread_cond_signal(&cond);
pthread_mutex_unlock(&mutex);
int8_t c = instruction_interval_counter%spin_adjust_interval ? spin_adjust_count : 0;
_spin_loop(opcycles * (spinloop_count + c) );
drift_interval_counter += c*cycle_nanoseconds;
if (drift_interval_counter < calibrate_interval)
{
return;
}
// -------------------------------------------------------------------------
// calibrate emulator clock to real clock ...
clock_gettime(CLOCK_MONOTONIC, &tj);
deltat = timespec_diff(ti, tj);
ti=tj;
if (deltat.tv_sec != 0)
// NOTE: these calculations could overflow if emulator speed is severely dampened back...
unsigned long real_counter = NANOSECONDS * deltat.tv_sec;
real_counter += deltat.tv_nsec;
long diff_nsecs = real_counter - drift_interval_counter; // whole +/- nsec diff
float nsecs_per_oneloop = cycle_nanoseconds/(float)spinloop_count;
unsigned int instruction_interval_nsecs = instruction_interval_counter * nsecs_per_oneloop;
// reset
drift_interval_counter=0;
instruction_interval_counter=0;
// calculate spin adjustment
if (diff_nsecs == 0)
{
// severely lagging, don't bother sleeping ...
if (severe_lag < time(NULL))
{
severe_lag = time(NULL)+2;
fprintf(stderr, "Severe lag detected...\n");
// nothing to do
}
else if (abs(diff_nsecs) > instruction_interval_nsecs)
{
// spin for additional +/- X each instruction
spinloop_count += diff_nsecs / instruction_interval_nsecs;
spin_adjust_interval=INT_MAX;
}
else
{
deltat.tv_nsec = processing_interval - deltat.tv_nsec + sleep_adjust_inc;
nanosleep(&deltat, NULL); // NOTE: spec says will return right away if deltat.tv_nsec value < 0 ...
ti.tv_nsec += deltat.tv_nsec;
}
if ((cycle%cpu_target_hz) == 0)
{
clock_gettime(CLOCK_MONOTONIC, &tj);
deltat = timespec_diff(t0, tj);
struct timespec t = (struct timespec) {.tv_sec=1, .tv_nsec=0 };
long adj = (deltat.tv_sec == 0)
? timespec_nsecs(timespec_diff(deltat, t))
: -1 * timespec_nsecs(timespec_diff(t, deltat));
sleep_adjust += adj;
sleep_adjust_inc = sleep_adjust/sleep_hz;
t0=tj;
ti=t0;
}
// sub adjustment : spin for additional +/- 1 every interval
spin_adjust_count = diff_nsecs < 0 ? -1 : 1;
spin_adjust_interval = instruction_interval_nsecs / abs(diff_nsecs);
}
}

View File

@ -13,12 +13,11 @@
#ifndef _TIMING_H_
#define _TIMING_H_
#define APPLE2_HZ 2040000
#define APPLE2_HZ 1020000
#define NANOSECONDS 1000000000
void timing_set_cpu_target_hz(unsigned long hz);
void timing_set_sleep_hz(unsigned int hz);
// 0 = run as fast as possible, 1 = approximate apple, X = 1/X rate
void timing_set_cpu_scale(unsigned int scale);
void timing_initialize();