Align the PowerPC registers struct manually, i.e. don't depend on non-portable
compiler extensions (e.g. GCC __attribute__((aligned(N)))).
2024-11-23 04:33:24 +00:00 · 2005-12-06 22:25:13 +00:00 · 2005-12-06 22:25:13 +00:00 · dd2b9a95d5
commit dd2b9a95d5
parent d8aa8a7459
6 changed files with 166 additions and 126 deletions
--- a/SheepShaver/src/kpx_cpu/sheepshaver_glue.cpp
+++ b/SheepShaver/src/kpx_cpu/sheepshaver_glue.cpp
@ -177,50 +177,8 @@ public:

 	// Make sure the SIGSEGV handler can access CPU registers
 	friend sigsegv_return_t sigsegv_handler(sigsegv_address_t, sigsegv_address_t);
-
-	// Memory allocator returning areas aligned on 16-byte boundaries
-	void *operator new(size_t size);
-	void operator delete(void *p);
 };

-// Memory allocator returning sheepshaver_cpu objects aligned on 16-byte boundaries
-// FORMAT: [ alignment ] magic identifier, offset to malloc'ed data, sheepshaver_cpu data
-void *sheepshaver_cpu::operator new(size_t size)
-{
-	const int ALIGN = 16;
-
-	// Allocate enough space for sheepshaver_cpu data + signature + align pad
-	uint8 *ptr = (uint8 *)malloc(size + ALIGN * 2);
-	if (ptr == NULL)
-		throw std::bad_alloc();
-
-	// Align memory
-	int ofs = 0;
-	while ((((uintptr)ptr) % ALIGN) != 0)
-		ofs++, ptr++;
-
-	// Insert signature and offset
-	struct aligned_block_t {
-		uint32 pad[(ALIGN - 8) / 4];
-		uint32 signature;
-		uint32 offset;
-		uint8  data[sizeof(sheepshaver_cpu)];
-	};
-	aligned_block_t *blk = (aligned_block_t *)ptr;
-	blk->signature = FOURCC('S','C','P','U');
-	blk->offset = ofs + (&blk->data[0] - (uint8 *)blk);
-	assert((((uintptr)&blk->data) % ALIGN) == 0);
-	return &blk->data[0];
-}
-
-void sheepshaver_cpu::operator delete(void *p)
-{
-	uint32 *blk = (uint32 *)p;
-	assert(blk[-2] == FOURCC('S','C','P','U'));
-	void *ptr = (void *)(((uintptr)p) - blk[-1]);
-	free(ptr);
-}
-
 sheepshaver_cpu::sheepshaver_cpu()
 	: powerpc_cpu(enable_jit_p())
 {
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp
@ -20,6 +20,7 @@

 #include "sysdeps.h"
 #include <stdlib.h>
+#include <assert.h>
 #include "vm_alloc.h"
 #include "cpu/vm.hpp"
 #include "cpu/ppc/ppc-cpu.hpp"
@ -119,6 +120,7 @@ uint32 powerpc_registers::reserve_data = 0;

 void powerpc_cpu::init_registers()
 {
+	assert((((uintptr)&vr(0)) % 16) == 0);
 	for (int i = 0; i < 32; i++) {
 		gpr(i) = 0;
 		fpr(i) = 0;
@ -299,6 +301,44 @@ void powerpc_cpu::initialize()
 #endif
 }

+// Class-specific allocator returning powerpc_cpu storage aligned on 16-byte boundaries
+// LAYOUT: [ skipped bytes ] 8 unused bytes, magic identifier, offset, powerpc_cpu data
+void *powerpc_cpu::operator new(size_t size)
+{
+	const int ALIGN = 16;
+
+	// Over-allocate: up to ALIGN-1 bytes may be skipped to reach a boundary
+	// and another ALIGN bytes hold the header stored in front of the object
+	uint8 * const raw = (uint8 *)malloc(size + ALIGN * 2);
+	if (raw == NULL)
+		throw std::bad_alloc();
+
+	// Distance from the malloc'ed pointer to the next ALIGN-byte boundary
+	const int skip = (ALIGN - (int)(((uintptr)raw) % ALIGN)) % ALIGN;
+	uint8 * const base = raw + skip;
+
+	// The object starts a full ALIGN bytes past that boundary
+	uint8 * const obj = base + ALIGN;
+	assert((((uintptr)obj) % ALIGN) == 0);
+
+	// Fill in the two 32-bit words that immediately precede the object:
+	// the magic identifier, then the offset from the malloc'ed pointer to
+	// the object (i.e. what must be subtracted again before calling free)
+	uint32 * const header = (uint32 *)obj;
+	header[-2] = 0x53435055;			/* 'SCPU' */
+	header[-1] = skip + ALIGN;
+
+	return obj;
+}
+
+void powerpc_cpu::operator delete(void *p)	// frees a block from powerpc_cpu::operator new
+{
+	if (p == NULL) return;			// a deallocation function may be called with a null pointer
+	uint32 *blk = (uint32 *)p;		// blk[-2] = magic identifier, blk[-1] = offset to malloc'ed data
+	assert(blk[-2] == 0x53435055);		/* 'SCPU' */
+	free((uint8 *)p - blk[-1]);		// step back to the pointer malloc() returned
+}
+
 #ifdef SHEEPSHAVER
 powerpc_cpu::powerpc_cpu(bool do_use_jit)
 	: use_jit(do_use_jit)
@ -468,9 +508,9 @@ bool powerpc_cpu::check_spcflags()
 		if (!processing_interrupt) {
 			processing_interrupt = true;
 			powerpc_registers r;
-			powerpc_registers::interrupt_copy(r, regs);
+			powerpc_registers::interrupt_copy(r, regs());
 			HandleInterrupt(&r);
-			powerpc_registers::interrupt_copy(regs, r);
+			powerpc_registers::interrupt_copy(regs(), r);
 			processing_interrupt = false;
 		}
 	}
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp
@ -38,7 +38,22 @@ class powerpc_cpu
 	: public basic_cpu
 #endif
 {
-	powerpc_registers regs;
+	// NOTE: PowerPC registers structure shall be aligned on 16-byte
+	// boundaries for the AltiVec registers to be used in native code
+	// with aligned load/stores.
+	//
+	// We can't assume (offsetof(powerpc_cpu, regs) % 16) == 0 since
+	// extra data could be inserted prior to regs, e.g. pointer to vtable
+	struct {
+		powerpc_registers regs;
+		uint8 pad[16];		// slack: regs_ptr() moves the live copy forward by 1..16 bytes
+	} _regs;
+
+	// The offset is computed relative to 'this' so that it may be folded at
+	// compile-time; the result is 16-byte aligned only if 'this' itself is
+	powerpc_registers *regs_ptr() const			{ return (powerpc_registers *)((char *)&_regs.regs + (16 - (((char *)&_regs.regs - (char *)this) % 16))); }
+	powerpc_registers const & regs() const		{ return *regs_ptr(); }
+	powerpc_registers & regs()					{ return *regs_ptr(); }

 #if PPC_PROFILE_REGS_USE
 	// Registers use statistics
@ -59,30 +74,30 @@ private:

 protected:

-	powerpc_spcflags & spcflags() { return regs.spcflags; }
-	powerpc_spcflags const & spcflags() const { return regs.spcflags; }
-	powerpc_cr_register & cr() { return regs.cr; }
-	powerpc_cr_register const & cr() const { return regs.cr; }
-	powerpc_xer_register & xer() { return regs.xer; }
-	powerpc_xer_register const & xer() const { return regs.xer; }
-	powerpc_vscr & vscr() { return regs.vscr; }
-	powerpc_vscr const & vscr() const { return regs.vscr; }
+	powerpc_spcflags & spcflags() { return regs().spcflags; }
+	powerpc_spcflags const & spcflags() const { return regs().spcflags; }
+	powerpc_cr_register & cr() { return regs().cr; }
+	powerpc_cr_register const & cr() const { return regs().cr; }
+	powerpc_xer_register & xer() { return regs().xer; }
+	powerpc_xer_register const & xer() const { return regs().xer; }
+	powerpc_vscr & vscr() { return regs().vscr; }
+	powerpc_vscr const & vscr() const { return regs().vscr; }

-	uint32 vrsave() const		{ return regs.vrsave; }
-	uint32 & vrsave()			{ return regs.vrsave; }
-	double fp_result() const	{ return regs.fp_result.d; }
-	double & fp_result()		{ return regs.fp_result.d; }
-	uint64 fp_result_dw() const	{ return regs.fp_result.j; }
-	uint64 & fp_result_dw()		{ return regs.fp_result.j; }
+	uint32 vrsave() const		{ return regs().vrsave; }
+	uint32 & vrsave()			{ return regs().vrsave; }
+	double fp_result() const	{ return regs().fp_result.d; }
+	double & fp_result()		{ return regs().fp_result.d; }
+	uint64 fp_result_dw() const	{ return regs().fp_result.j; }
+	uint64 & fp_result_dw()		{ return regs().fp_result.j; }

-	uint32 & fpscr()			{ return regs.fpscr; }
-	uint32 fpscr() const		{ return regs.fpscr; }
-	uint32 & lr()				{ return regs.lr; }
-	uint32 lr() const			{ return regs.lr; }
-	uint32 & ctr()				{ return regs.ctr; }
-	uint32 ctr() const			{ return regs.ctr; }
-	uint32 & pc()				{ return regs.pc; }
-	uint32 pc() const			{ return regs.pc; }
+	uint32 & fpscr()			{ return regs().fpscr; }
+	uint32 fpscr() const		{ return regs().fpscr; }
+	uint32 & lr()				{ return regs().lr; }
+	uint32 lr() const			{ return regs().lr; }
+	uint32 & ctr()				{ return regs().ctr; }
+	uint32 ctr() const			{ return regs().ctr; }
+	uint32 & pc()				{ return regs().pc; }
+	uint32 pc() const			{ return regs().pc; }
 	void increment_pc(int o)	{ pc() += o; }

 	friend class pc_operand;
@ -94,14 +109,14 @@ protected:

 public:

-	uint32 & gpr(int i)			{ log_reg(i); return regs.gpr[i]; }
-	uint32 gpr(int i) const		{ log_reg(i); return regs.gpr[i]; }
-	double & fpr(int i)			{ return regs.fpr[i].d; }
-	double fpr(int i) const		{ return regs.fpr[i].d; }
-	uint64 & fpr_dw(int i)		{ return regs.fpr[i].j; }
-	uint64 fpr_dw(int i) const	{ return regs.fpr[i].j; }
-	powerpc_vr & vr(int i)		{ return regs.vr[i]; }
-	powerpc_vr const & vr(int i) const { return regs.vr[i]; }
+	uint32 & gpr(int i)			{ log_reg(i); return regs().gpr[i]; }
+	uint32 gpr(int i) const		{ log_reg(i); return regs().gpr[i]; }
+	double & fpr(int i)			{ return regs().fpr[i].d; }
+	double fpr(int i) const		{ return regs().fpr[i].d; }
+	uint64 & fpr_dw(int i)		{ return regs().fpr[i].j; }
+	uint64 fpr_dw(int i) const	{ return regs().fpr[i].j; }
+	powerpc_vr & vr(int i)		{ return regs().vr[i]; }
+	powerpc_vr const & vr(int i) const { return regs().vr[i]; }

 protected:

@ -258,6 +273,10 @@ public:
 #endif
 	~powerpc_cpu();

+	// Specialised memory allocation (needs to be 16-byte aligned)
+	void *operator new(size_t size);
+	void operator delete(void *p);
+
 	// Handle flight recorder
 #if PPC_FLIGHT_RECORDER
 	bool is_logging() const { return logging; }
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-execute.cpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-execute.cpp
@ -735,10 +735,10 @@ template< class RA >
 void powerpc_cpu::execute_lwarx(uint32 opcode)
 {
 	const uint32 ea = RA::get(this, opcode) + operand_RB::get(this, opcode);
-	regs.reserve_valid = 1;
-	regs.reserve_addr = ea;
-	regs.reserve_data = vm_read_memory_4(ea);
-	operand_RD::set(this, opcode, regs.reserve_data);
+	regs().reserve_valid = 1;
+	regs().reserve_addr = ea;
+	regs().reserve_data = vm_read_memory_4(ea);
+	operand_RD::set(this, opcode, regs().reserve_data);
 	increment_pc(4);
 }

@ -747,13 +747,13 @@ void powerpc_cpu::execute_stwcx(uint32 opcode)
 {
 	const uint32 ea = RA::get(this, opcode) + operand_RB::get(this, opcode);
 	cr().clear(0);
-	if (regs.reserve_valid) {
-		if (regs.reserve_addr == ea /* physical_addr(EA) */
-			&& /* HACK */ regs.reserve_data == vm_read_memory_4(ea)) {
+	if (regs().reserve_valid) {
+		if (regs().reserve_addr == ea /* physical_addr(EA) */
+			&& /* HACK */ regs().reserve_data == vm_read_memory_4(ea)) {
 			vm_write_memory_4(ea, operand_RS::get(this, opcode));
 			cr().set(0, standalone_CR_EQ_field::mask());
 		}
-		regs.reserve_valid = 0;
+		regs().reserve_valid = 0;
 	}
 	cr().set_so(0, xer().get_so());
 	increment_pc(4);
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp
@ -210,13 +210,7 @@ union powerpc_vr
 	uint32	w[4];
 	uint64	j[2];
 	float	f[4];
-}
-#if defined(__GNUC__)
-// 16-byte alignment is required for SIMD optimizations operating on
-// 128-bit aligned registers (e.g. SSE).
-__attribute__((aligned(16)))
-#endif
-;
+};


 /**
@ -253,8 +247,8 @@ struct powerpc_registers
 	powerpc_fpr fpr[32];		// Floating-Point Registers
 	powerpc_fpr	fp_result;		// Floating-Point result
 	powerpc_cr_register cr;		// Condition Register
-	uint32	fpscr;				// Floating-Point Status and Control Register
 	powerpc_xer_register xer;	// XER Register (SPR 1)
+	uint32	fpscr;				// Floating-Point Status and Control Register
 	uint32	lr;					// Link Register (SPR 8)
 	uint32	ctr;				// Count Register (SPR 9)
 	uint32	pc;					// Program Counter
@ -262,6 +256,9 @@ struct powerpc_registers
 	static uint32 reserve_valid;
 	static uint32 reserve_addr;
 	static uint32 reserve_data;
+#define PPC_SZ(T) sizeof(powerpc_##T)
+	uint8 _pad[16 - ((PPC_SZ(fpr) + PPC_SZ(cr_register) + PPC_SZ(xer_register) + PPC_SZ(spcflags)) % 16)];
+#undef  PPC_SZ
 	powerpc_vr vr[32];			// Vector Registers
 	powerpc_vscr vscr;			// Vector Status and Control Register
 	uint32 vrsave;				// AltiVec Save Register
--- a/SheepShaver/src/kpx_cpu/src/test/test-powerpc.cpp
+++ b/SheepShaver/src/kpx_cpu/src/test/test-powerpc.cpp
@ -104,14 +104,6 @@ typedef uintptr_t uintptr;
 #define TEST_VMX_ARITH	1


-#if defined __GNUC__
-#define ALIGNED(N) __attribute__((aligned(N)))
-#else
-#if TEST_VMX_OPS
-#error "AltiVec testing requires the align attribute"
-#endif
-#endif
-
 // Partial PowerPC runtime assembler from GNU lightning
 #undef  _I
 #define _I(X)			((uint32)(X))
@ -506,7 +498,25 @@ typedef bit_field<  2,  2 > XER_CA_field;
 static bool has_altivec = true;

 // A 128-bit AltiVec register
-typedef uint8 vector_t[16] ALIGNED(16);
+typedef uint8 vector_t[16];
+
+// Holds one vector_t at a 16-byte aligned address, wherever the object itself lives
+class aligned_vector_t {
+	struct {
+		vector_t v;
+		uint8 pad[16];		// slack so that a 16-byte aligned slot always fits
+	} vs;
+public:
+	aligned_vector_t()								{ memset(addr(), 0, sizeof(vector_t)); }
+	aligned_vector_t(vector_t const & vi)			{ memcpy(addr(), &vi, sizeof(vector_t)); }
+	// Copies go through addr(): source and target may be aligned differently
+	aligned_vector_t(aligned_vector_t const & o)	{ memcpy(addr(), o.addr(), sizeof(vector_t)); }
+	aligned_vector_t & operator=(aligned_vector_t const & o) { memmove(addr(), o.addr(), sizeof(vector_t)); return *this; }
+	// Round the absolute address up: 'this' is not guaranteed to be 16-byte aligned
+	vector_t *addr() const							{ return (vector_t *)(((uintptr)&vs.v + 15) & ~(uintptr)15); }
+	vector_t const & value() const					{ return *addr(); }
+	vector_t & value()								{ return *addr(); }
+};

 union vector_helper_t {
 	vector_t v;
@ -721,7 +731,7 @@ private:

 	struct vector_value_t {
 		char type;
-		vector_t v ALIGNED(16);
+		vector_t v;
 	};

 	static const uint32 reg_values[];
@ -1611,24 +1621,25 @@ void powerpc_test_cpu::test_one_vector(uint32 *code, vector_test_t const & vt, u
 #endif

 	// Invoke emulated code
-	static vector_t emul_vD;
-	memset(&emul_vD, 0, sizeof(emul_vD));
-	static vector_helper_t emul_vSCR;
-	memset(&emul_vSCR, 0, sizeof(emul_vSCR));
-	emul_vSCR.w[3] = 0;
+	static aligned_vector_t emul_vD;
+	memset(emul_vD.addr(), 0, sizeof(vector_t));
+	static aligned_vector_t emul_vSCR;
+	memset(emul_vSCR.addr(), 0, sizeof(vector_t));
 	emul_set_cr(init_cr);
-	set_gpr(RD, (uintptr)&emul_vD);
+	set_gpr(RD, (uintptr)emul_vD.addr());
 	set_gpr(RA, (uintptr)rAp);
 	set_gpr(RB, (uintptr)rBp);
 	set_gpr(RC, (uintptr)rCp);
-	set_gpr(VSCR, (uintptr)emul_vSCR.b);
+	set_gpr(VSCR, (uintptr)emul_vSCR.addr());
 	execute(code);
+	vector_helper_t emul_vSCR_helper;
+	memcpy(&emul_vSCR_helper, emul_vSCR.addr(), sizeof(vector_t));
 	const uint32 emul_cr = emul_get_cr();
-	const uint32 emul_vscr = ntohl(emul_vSCR.w[3]);
+	const uint32 emul_vscr = ntohl(emul_vSCR_helper.w[3]);

 	++tests;

-	bool ok = vector_equals(vt.type, native_vD, emul_vD)
+	bool ok = vector_equals(vt.type, native_vD, emul_vD.value())
 		&& native_cr == emul_cr
 		&& native_vscr == emul_vscr;

@ -1676,7 +1687,7 @@ void powerpc_test_cpu::test_one_vector(uint32 *code, vector_test_t const & vt, u
 		print_vector(native_vD, vt.type);
 		printf("\n");
 		printf("vD.E = ");
-		print_vector(emul_vD, vt.type);
+		print_vector(emul_vD.value(), vt.type);
 		printf("\n");
 		printf("CR.N = %08x ; VSCR.N = %08x\n", native_cr, native_vscr);
 		printf("CR.E = %08x ; VSCR.E = %08x\n", emul_cr, emul_vscr);
@ -1764,7 +1775,6 @@ void powerpc_test_cpu::test_vector_load(void)
 		}
 	}
 	assert(i_opcode != -1);
-	assert(((uintptr)&vector_values[0].v) % 16 == 0);

 	const int n_elements = sizeof(tests) / sizeof(tests[0]);
 	for (int i = 0; i < n_elements; i++) {
@ -1778,18 +1788,19 @@ void powerpc_test_cpu::test_vector_load(void)
 		printf("Testing %s\n", vt.name);
 		const int n_vector_values = sizeof(vector_values)/sizeof(vector_values[0]);
 		for (int j = 0; j < n_vector_values; j++) {
+			aligned_vector_t av(vector_values[j].v);
 			switch (vt.type) {
 			case 'b':
 				for (int k = 0; k < 16; k++)
-					test_one_vector(code, vt, ((uint8 *)&vector_values[j].v) + 1 * k);
+					test_one_vector(code, vt, ((uint8 *)av.addr()) + 1 * k);
 				break;
 			case 'h':
 				for (int k = 0; k < 8; k++)
-					test_one_vector(code, vt, ((uint8 *)&vector_values[j].v) + 2 * k);
+					test_one_vector(code, vt, ((uint8 *)av.addr()) + 2 * k);
 				break;
 			case 'w':
 				for (int k = 0; k < 4; k++)
-					test_one_vector(code, vt, ((uint8 *)&vector_values[j].v) + 4 * k);
+					test_one_vector(code, vt, ((uint8 *)av.addr()) + 4 * k);
 				break;
 			}
 		}
@ -2012,29 +2023,40 @@ void powerpc_test_cpu::test_vector_arith(void)

 		printf("Testing %s\n", vt.name);
 		if (vt.operands[1] == vA && vt.operands[2] == vB && vt.operands[3] == vC) {
-			for (int i = 0; i < n_vector_values; i++)
-				for (int j = 0; j < n_vector_values; j++)
-					for (int k = 0; k < n_vector_values; k++)
-						test_one_vector(code, vt, &vvp[i].v, &vvp[j].v, &vvp[k].v);
+			for (int i = 0; i < n_vector_values; i++) {
+				aligned_vector_t avi(vvp[i].v);
+				for (int j = 0; j < n_vector_values; j++) {
+					aligned_vector_t avj(vvp[j].v);
+					for (int k = 0; k < n_vector_values; k++) {
+						aligned_vector_t avk(vvp[k].v);
+						test_one_vector(code, vt, avi.addr(), avj.addr(), avk.addr());
+					}
+				}
+			}
 		}
 		else if (vt.operands[1] == vA && vt.operands[2] == vB && vt.operands[3] == vN) {
 			for (int i = 0; i < 16; i++) {
 				vSH_field::insert(vt.opcode, i);
 				code[i_opcode] = vt.opcode;
 				flush_icache_range(code, sizeof(code));
-				for (int j = 0; j < n_vector_values; j++)
+				aligned_vector_t avi(vvp[i].v);
+				for (int j = 0; j < n_vector_values; j++) {
+					aligned_vector_t avj(vvp[j].v);
 					for (int k = 0; k < n_vector_values; k++)
-						test_one_vector(code, vt, &vvp[i].v, &vvp[j].v);
+						test_one_vector(code, vt, avi.addr(), avj.addr());
+				}
 			}
 		}
 		else if (vt.operands[1] == vA && vt.operands[2] == vB) {
 			for (int i = 0; i < n_vector_values; i++) {
+				aligned_vector_t avi(vvp[i].v);
 				for (int j = 0; j < n_vector_values; j++) {
 					if (op_type == 'B') {
 						if (!vector_all_eq('b', vvp[j].v))
 							continue;
 					}
-					test_one_vector(code, vt, &vvp[i].v, &vvp[j].v);
+					aligned_vector_t avj(vvp[j].v);
+					test_one_vector(code, vt, avi.addr(), avj.addr());
 				}
 			}
 		}
@ -2043,8 +2065,10 @@ void powerpc_test_cpu::test_vector_arith(void)
 				rA_field::insert(vt.opcode, i);
 				code[i_opcode] = vt.opcode;
 				flush_icache_range(code, sizeof(code));
-				for (int j = 0; j < n_vector_values; j++)
-					test_one_vector(code, vt, NULL, &vvp[j].v);
+				for (int j = 0; j < n_vector_values; j++) {
+					aligned_vector_t avj(vvp[j].v);
+					test_one_vector(code, vt, NULL, avj.addr());
+				}
 			}
 		}
 		else if (vt.operands[1] == vI) {
@ -2056,8 +2080,10 @@ void powerpc_test_cpu::test_vector_arith(void)
 			}
 		}
 		else if (vt.operands[1] == __ && vt.operands[2] == vB) {
-			for (int i = 0; i < n_vector_values; i++)
-				test_one_vector(code, vt, NULL, &vvp[i].v);
+			for (int i = 0; i < n_vector_values; i++) {
+				aligned_vector_t avi(vvp[i].v);
+				test_one_vector(code, vt, NULL, avi.addr());
+			}
 		}
 		else {
 			printf("ERROR: unhandled test case\n");