From dd2b9a95d5c51866c5fb9571c9dfd3ec8e11a246 Mon Sep 17 00:00:00 2001 From: gbeauche <> Date: Tue, 6 Dec 2005 22:25:13 +0000 Subject: [PATCH] Align PowerPC registers struct manually, i.e. don't depend on non-portable compiler extensions (e.g. GCC __attribute__((aligned(N)))). --- SheepShaver/src/kpx_cpu/sheepshaver_glue.cpp | 42 -------- .../src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp | 44 ++++++++- .../src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp | 81 ++++++++++------ .../src/kpx_cpu/src/cpu/ppc/ppc-execute.cpp | 16 ++-- .../src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp | 13 +-- .../src/kpx_cpu/src/test/test-powerpc.cpp | 96 ++++++++++++------- 6 files changed, 166 insertions(+), 126 deletions(-) diff --git a/SheepShaver/src/kpx_cpu/sheepshaver_glue.cpp b/SheepShaver/src/kpx_cpu/sheepshaver_glue.cpp index 9f8c5eb1..6f20f39e 100644 --- a/SheepShaver/src/kpx_cpu/sheepshaver_glue.cpp +++ b/SheepShaver/src/kpx_cpu/sheepshaver_glue.cpp @@ -177,50 +177,8 @@ public: // Make sure the SIGSEGV handler can access CPU registers friend sigsegv_return_t sigsegv_handler(sigsegv_address_t, sigsegv_address_t); - - // Memory allocator returning areas aligned on 16-byte boundaries - void *operator new(size_t size); - void operator delete(void *p); }; -// Memory allocator returning sheepshaver_cpu objects aligned on 16-byte boundaries -// FORMAT: [ alignment ] magic identifier, offset to malloc'ed data, sheepshaver_cpu data -void *sheepshaver_cpu::operator new(size_t size) -{ - const int ALIGN = 16; - - // Allocate enough space for sheepshaver_cpu data + signature + align pad - uint8 *ptr = (uint8 *)malloc(size + ALIGN * 2); - if (ptr == NULL) - throw std::bad_alloc(); - - // Align memory - int ofs = 0; - while ((((uintptr)ptr) % ALIGN) != 0) - ofs++, ptr++; - - // Insert signature and offset - struct aligned_block_t { - uint32 pad[(ALIGN - 8) / 4]; - uint32 signature; - uint32 offset; - uint8 data[sizeof(sheepshaver_cpu)]; - }; - aligned_block_t *blk = (aligned_block_t *)ptr; - blk->signature = FOURCC('S','C','P','U'); - blk->offset = ofs + (&blk->data[0] - (uint8 *)blk); - assert((((uintptr)&blk->data) % ALIGN) == 0); - return &blk->data[0]; -} - -void sheepshaver_cpu::operator delete(void *p) -{ - uint32 *blk = (uint32 *)p; - assert(blk[-2] == FOURCC('S','C','P','U')); - void *ptr = (void *)(((uintptr)p) - blk[-1]); - free(ptr); -} - sheepshaver_cpu::sheepshaver_cpu() : powerpc_cpu(enable_jit_p()) { diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp index 8588569b..f86b9538 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.cpp @@ -20,6 +20,7 @@ #include "sysdeps.h" #include +#include #include "vm_alloc.h" #include "cpu/vm.hpp" #include "cpu/ppc/ppc-cpu.hpp" @@ -119,6 +120,7 @@ uint32 powerpc_registers::reserve_data = 0; void powerpc_cpu::init_registers() { + assert((((uintptr)&vr(0)) % 16) == 0); for (int i = 0; i < 32; i++) { gpr(i) = 0; fpr(i) = 0; @@ -299,6 +301,44 @@ void powerpc_cpu::initialize() #endif } +// Memory allocator returning powerpc_cpu objects aligned on 16-byte boundaries +// FORMAT: [ alignment ] magic identifier, offset to malloc'ed data, powerpc_cpu data +void *powerpc_cpu::operator new(size_t size) +{ + const int ALIGN = 16; + + // Allocate enough space for powerpc_cpu data + signature + align pad + uint8 *ptr = (uint8 *)malloc(size + ALIGN * 2); + if (ptr == NULL) + throw std::bad_alloc(); + + // Align memory + int ofs = 0; + while ((((uintptr)ptr) % ALIGN) != 0) + ofs++, ptr++; + + // Insert signature and offset + struct aligned_block_t { + uint32 pad[(ALIGN - 8) / 4]; + uint32 signature; + uint32 offset; + uint8 data[sizeof(powerpc_cpu)]; + }; + aligned_block_t *blk = (aligned_block_t *)ptr; + blk->signature = 0x53435055; /* 'SCPU' */ + blk->offset = ofs + (&blk->data[0] - (uint8 *)blk); + assert((((uintptr)&blk->data) % ALIGN) == 0); + return &blk->data[0]; +} + +void powerpc_cpu::operator delete(void *p) +{ + uint32 *blk = (uint32 *)p; + assert(blk[-2] == 0x53435055); /* 'SCPU' */ + void *ptr = (void *)(((uintptr)p) - blk[-1]); + free(ptr); +} + #ifdef SHEEPSHAVER powerpc_cpu::powerpc_cpu(bool do_use_jit) : use_jit(do_use_jit) @@ -468,9 +508,9 @@ bool powerpc_cpu::check_spcflags() if (!processing_interrupt) { processing_interrupt = true; powerpc_registers r; - powerpc_registers::interrupt_copy(r, regs); + powerpc_registers::interrupt_copy(r, regs()); HandleInterrupt(&r); - powerpc_registers::interrupt_copy(regs, r); + powerpc_registers::interrupt_copy(regs(), r); processing_interrupt = false; } } diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp index e68342dd..97060b77 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp @@ -38,7 +38,22 @@ class powerpc_cpu : public basic_cpu #endif { - powerpc_registers regs; + // NOTE: PowerPC registers structure shall be aligned on 16-byte + // boundaries for the AltiVec registers to be used in native code + // with aligned load/stores. + // + // We can't assume (offsetof(powerpc_cpu, regs) % 16) == 0 since + // extra data could be inserted prior regs, e.g. pointer to vtable + struct { + powerpc_registers regs; + uint8 pad[16]; + } _regs; + + // Make sure the calculation of the current offset makes use of + // 'this' as this could make it simplified at compile-time + powerpc_registers *regs_ptr() const { return (powerpc_registers *)((char *)&_regs.regs + (16 - (((char *)&_regs.regs - (char *)this) % 16))); } + powerpc_registers const & regs() const { return *regs_ptr(); } + powerpc_registers & regs() { return *regs_ptr(); } #if PPC_PROFILE_REGS_USE // Registers use statistics @@ -59,30 +74,30 @@ private: protected: - powerpc_spcflags & spcflags() { return regs.spcflags; } - powerpc_spcflags const & spcflags() const { return regs.spcflags; } - powerpc_cr_register & cr() { return regs.cr; } - powerpc_cr_register const & cr() const { return regs.cr; } - powerpc_xer_register & xer() { return regs.xer; } - powerpc_xer_register const & xer() const { return regs.xer; } - powerpc_vscr & vscr() { return regs.vscr; } - powerpc_vscr const & vscr() const { return regs.vscr; } + powerpc_spcflags & spcflags() { return regs().spcflags; } + powerpc_spcflags const & spcflags() const { return regs().spcflags; } + powerpc_cr_register & cr() { return regs().cr; } + powerpc_cr_register const & cr() const { return regs().cr; } + powerpc_xer_register & xer() { return regs().xer; } + powerpc_xer_register const & xer() const { return regs().xer; } + powerpc_vscr & vscr() { return regs().vscr; } + powerpc_vscr const & vscr() const { return regs().vscr; } - uint32 vrsave() const { return regs.vrsave; } - uint32 & vrsave() { return regs.vrsave; } - double fp_result() const { return regs.fp_result.d; } - double & fp_result() { return regs.fp_result.d; } - uint64 fp_result_dw() const { return regs.fp_result.j; } - uint64 & fp_result_dw() { return regs.fp_result.j; } + uint32 vrsave() const { return regs().vrsave; } + uint32 & vrsave() { return regs().vrsave; } + double fp_result() const { return regs().fp_result.d; } + double & fp_result() { return regs().fp_result.d; } + uint64 fp_result_dw() const { return regs().fp_result.j; } + uint64 & fp_result_dw() { return regs().fp_result.j; } - uint32 & fpscr() { return regs.fpscr; } - uint32 fpscr() const { return regs.fpscr; } - uint32 & lr() { return regs.lr; } - uint32 lr() const { return regs.lr; } - uint32 & ctr() { return regs.ctr; } - uint32 ctr() const { return regs.ctr; } - uint32 & pc() { return regs.pc; } - uint32 pc() const { return regs.pc; } + uint32 & fpscr() { return regs().fpscr; } + uint32 fpscr() const { return regs().fpscr; } + uint32 & lr() { return regs().lr; } + uint32 lr() const { return regs().lr; } + uint32 & ctr() { return regs().ctr; } + uint32 ctr() const { return regs().ctr; } + uint32 & pc() { return regs().pc; } + uint32 pc() const { return regs().pc; } void increment_pc(int o) { pc() += o; } friend class pc_operand; @@ -94,14 +109,14 @@ protected: public: - uint32 & gpr(int i) { log_reg(i); return regs.gpr[i]; } - uint32 gpr(int i) const { log_reg(i); return regs.gpr[i]; } - double & fpr(int i) { return regs.fpr[i].d; } - double fpr(int i) const { return regs.fpr[i].d; } - uint64 & fpr_dw(int i) { return regs.fpr[i].j; } - uint64 fpr_dw(int i) const { return regs.fpr[i].j; } - powerpc_vr & vr(int i) { return regs.vr[i]; } - powerpc_vr const & vr(int i) const { return regs.vr[i]; } + uint32 & gpr(int i) { log_reg(i); return regs().gpr[i]; } + uint32 gpr(int i) const { log_reg(i); return regs().gpr[i]; } + double & fpr(int i) { return regs().fpr[i].d; } + double fpr(int i) const { return regs().fpr[i].d; } + uint64 & fpr_dw(int i) { return regs().fpr[i].j; } + uint64 fpr_dw(int i) const { return regs().fpr[i].j; } + powerpc_vr & vr(int i) { return regs().vr[i]; } + powerpc_vr const & vr(int i) const { return regs().vr[i]; } protected: @@ -258,6 +273,10 @@ public: #endif ~powerpc_cpu(); + // Specialised memory allocation (needs to be 16-byte aligned) + void *operator new(size_t size); + void operator delete(void *p); + // Handle flight recorder #if PPC_FLIGHT_RECORDER bool is_logging() const { return logging; } diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-execute.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-execute.cpp index d6a174f7..72ff2aef 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-execute.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-execute.cpp @@ -735,10 +735,10 @@ template< class RA > void powerpc_cpu::execute_lwarx(uint32 opcode) { const uint32 ea = RA::get(this, opcode) + operand_RB::get(this, opcode); - regs.reserve_valid = 1; - regs.reserve_addr = ea; - regs.reserve_data = vm_read_memory_4(ea); - operand_RD::set(this, opcode, regs.reserve_data); + regs().reserve_valid = 1; + regs().reserve_addr = ea; + regs().reserve_data = vm_read_memory_4(ea); + operand_RD::set(this, opcode, regs().reserve_data); increment_pc(4); } @@ -747,13 +747,13 @@ void powerpc_cpu::execute_stwcx(uint32 opcode) { const uint32 ea = RA::get(this, opcode) + operand_RB::get(this, opcode); cr().clear(0); - if (regs.reserve_valid) { - if (regs.reserve_addr == ea /* physical_addr(EA) */ - && /* HACK */ regs.reserve_data == vm_read_memory_4(ea)) { + if (regs().reserve_valid) { + if (regs().reserve_addr == ea /* physical_addr(EA) */ + && /* HACK */ regs().reserve_data == vm_read_memory_4(ea)) { vm_write_memory_4(ea, operand_RS::get(this, opcode)); cr().set(0, standalone_CR_EQ_field::mask()); } - regs.reserve_valid = 0; + regs().reserve_valid = 0; } cr().set_so(0, xer().get_so()); increment_pc(4); diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp index 923d4474..e55f1fe0 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp @@ -210,13 +210,7 @@ union powerpc_vr uint32 w[4]; uint64 j[2]; float f[4]; -} -#if defined(__GNUC__) -// 16-byte alignment is required for SIMD optimizations operating on -// 128-bit aligned registers (e.g. SSE). -__attribute__((aligned(16))) -#endif -; +}; /** @@ -253,8 +247,8 @@ struct powerpc_registers powerpc_fpr fpr[32]; // Floating-Point Registers powerpc_fpr fp_result; // Floating-Point result powerpc_cr_register cr; // Condition Register - uint32 fpscr; // Floating-Point Status and Control Register powerpc_xer_register xer; // XER Register (SPR 1) + uint32 fpscr; // Floating-Point Status and Control Register uint32 lr; // Link Register (SPR 8) uint32 ctr; // Count Register (SPR 9) uint32 pc; // Program Counter @@ -262,6 +256,9 @@ struct powerpc_registers static uint32 reserve_valid; static uint32 reserve_addr; static uint32 reserve_data; +#define PPC_SZ(T) sizeof(powerpc_##T) + uint8 _pad[16 - ((PPC_SZ(fpr) + PPC_SZ(cr_register) + PPC_SZ(xer_register) + PPC_SZ(spcflags)) % 16)]; +#undef PPC_SZ powerpc_vr vr[32]; // Vector Registers powerpc_vscr vscr; // Vector Status and Control Register uint32 vrsave; // AltiVec Save Register diff --git a/SheepShaver/src/kpx_cpu/src/test/test-powerpc.cpp b/SheepShaver/src/kpx_cpu/src/test/test-powerpc.cpp index 597553a9..1c19bdb7 100644 --- a/SheepShaver/src/kpx_cpu/src/test/test-powerpc.cpp +++ b/SheepShaver/src/kpx_cpu/src/test/test-powerpc.cpp @@ -104,14 +104,6 @@ typedef uintptr_t uintptr; #define TEST_VMX_ARITH 1 -#if defined __GNUC__ -#define ALIGNED(N) __attribute__((aligned(N))) -#else -#if TEST_VMX_OPS -#error "AltiVec testing requires the align attribute" -#endif -#endif - // Partial PowerPC runtime assembler from GNU lightning #undef _I #define _I(X) ((uint32)(X)) @@ -506,7 +498,25 @@ typedef bit_field< 2, 2 > XER_CA_field; static bool has_altivec = true; // A 128-bit AltiVec register -typedef uint8 vector_t[16] ALIGNED(16); +typedef uint8 vector_t[16]; + +class aligned_vector_t { + struct { + vector_t v; + uint8 pad[16]; + } vs; +public: + aligned_vector_t() + { memset(addr(), 0, sizeof(vector_t)); } + aligned_vector_t(vector_t const & vi) + { memcpy(addr(), &vi, sizeof(vector_t)); } + vector_t *addr() const + { return (vector_t *)((char *)&vs.v + (16 - (((char *)&vs.v - (char *)this) % 16))); } + vector_t const & value() const + { return *addr(); } + vector_t & value() + { return *addr(); } +}; union vector_helper_t { vector_t v; @@ -721,7 +731,7 @@ private: struct vector_value_t { char type; - vector_t v ALIGNED(16); + vector_t v; }; static const uint32 reg_values[]; @@ -1611,24 +1621,25 @@ void powerpc_test_cpu::test_one_vector(uint32 *code, vector_test_t const & vt, u #endif // Invoke emulated code - static vector_t emul_vD; - memset(&emul_vD, 0, sizeof(emul_vD)); - static vector_helper_t emul_vSCR; - memset(&emul_vSCR, 0, sizeof(emul_vSCR)); - emul_vSCR.w[3] = 0; + static aligned_vector_t emul_vD; + memset(emul_vD.addr(), 0, sizeof(vector_t)); + static aligned_vector_t emul_vSCR; + memset(emul_vSCR.addr(), 0, sizeof(vector_t)); emul_set_cr(init_cr); - set_gpr(RD, (uintptr)&emul_vD); + set_gpr(RD, (uintptr)emul_vD.addr()); set_gpr(RA, (uintptr)rAp); set_gpr(RB, (uintptr)rBp); set_gpr(RC, (uintptr)rCp); - set_gpr(VSCR, (uintptr)emul_vSCR.b); + set_gpr(VSCR, (uintptr)emul_vSCR.addr()); execute(code); + vector_helper_t emul_vSCR_helper; + memcpy(&emul_vSCR_helper, emul_vSCR.addr(), sizeof(vector_t)); const uint32 emul_cr = emul_get_cr(); - const uint32 emul_vscr = ntohl(emul_vSCR.w[3]); + const uint32 emul_vscr = ntohl(emul_vSCR_helper.w[3]); ++tests; - bool ok = vector_equals(vt.type, native_vD, emul_vD) + bool ok = vector_equals(vt.type, native_vD, emul_vD.value()) && native_cr == emul_cr && native_vscr == emul_vscr; @@ -1676,7 +1687,7 @@ void powerpc_test_cpu::test_one_vector(uint32 *code, vector_test_t const & vt, u print_vector(native_vD, vt.type); printf("\n"); printf("vD.E = "); - print_vector(emul_vD, vt.type); + print_vector(emul_vD.value(), vt.type); printf("\n"); printf("CR.N = %08x ; VSCR.N = %08x\n", native_cr, native_vscr); printf("CR.E = %08x ; VSCR.E = %08x\n", emul_cr, emul_vscr); @@ -1764,7 +1775,6 @@ void powerpc_test_cpu::test_vector_load(void) } } assert(i_opcode != -1); - assert(((uintptr)&vector_values[0].v) % 16 == 0); const int n_elements = sizeof(tests) / sizeof(tests[0]); for (int i = 0; i < n_elements; i++) { @@ -1778,18 +1788,19 @@ void powerpc_test_cpu::test_vector_load(void) printf("Testing %s\n", vt.name); const int n_vector_values = sizeof(vector_values)/sizeof(vector_values[0]); for (int j = 0; j < n_vector_values; j++) { + aligned_vector_t av(vector_values[j].v); switch (vt.type) { case 'b': for (int k = 0; k < 16; k++) - test_one_vector(code, vt, ((uint8 *)&vector_values[j].v) + 1 * k); + test_one_vector(code, vt, ((uint8 *)av.addr()) + 1 * k); break; case 'h': for (int k = 0; k < 8; k++) - test_one_vector(code, vt, ((uint8 *)&vector_values[j].v) + 2 * k); + test_one_vector(code, vt, ((uint8 *)av.addr()) + 2 * k); break; case 'w': for (int k = 0; k < 4; k++) - test_one_vector(code, vt, ((uint8 *)&vector_values[j].v) + 4 * k); + test_one_vector(code, vt, ((uint8 *)av.addr()) + 4 * k); break; } } @@ -2012,29 +2023,40 @@ void powerpc_test_cpu::test_vector_arith(void) printf("Testing %s\n", vt.name); if (vt.operands[1] == vA && vt.operands[2] == vB && vt.operands[3] == vC) { - for (int i = 0; i < n_vector_values; i++) - for (int j = 0; j < n_vector_values; j++) - for (int k = 0; k < n_vector_values; k++) - test_one_vector(code, vt, &vvp[i].v, &vvp[j].v, &vvp[k].v); + for (int i = 0; i < n_vector_values; i++) { + aligned_vector_t avi(vvp[i].v); + for (int j = 0; j < n_vector_values; j++) { + aligned_vector_t avj(vvp[j].v); + for (int k = 0; k < n_vector_values; k++) { + aligned_vector_t avk(vvp[k].v); + test_one_vector(code, vt, avi.addr(), avj.addr(), avk.addr()); + } + } + } } else if (vt.operands[1] == vA && vt.operands[2] == vB && vt.operands[3] == vN) { for (int i = 0; i < 16; i++) { vSH_field::insert(vt.opcode, i); code[i_opcode] = vt.opcode; flush_icache_range(code, sizeof(code)); - for (int j = 0; j < n_vector_values; j++) + aligned_vector_t avi(vvp[i].v); + for (int j = 0; j < n_vector_values; j++) { + aligned_vector_t avj(vvp[j].v); for (int k = 0; k < n_vector_values; k++) - test_one_vector(code, vt, &vvp[i].v, &vvp[j].v); + test_one_vector(code, vt, avi.addr(), avj.addr()); + } } } else if (vt.operands[1] == vA && vt.operands[2] == vB) { for (int i = 0; i < n_vector_values; i++) { + aligned_vector_t avi(vvp[i].v); for (int j = 0; j < n_vector_values; j++) { if (op_type == 'B') { if (!vector_all_eq('b', vvp[j].v)) continue; } - test_one_vector(code, vt, &vvp[i].v, &vvp[j].v); + aligned_vector_t avj(vvp[j].v); + test_one_vector(code, vt, avi.addr(), avj.addr()); } } } @@ -2043,8 +2065,10 @@ void powerpc_test_cpu::test_vector_arith(void) rA_field::insert(vt.opcode, i); code[i_opcode] = vt.opcode; flush_icache_range(code, sizeof(code)); - for (int j = 0; j < n_vector_values; j++) - test_one_vector(code, vt, NULL, &vvp[j].v); + for (int j = 0; j < n_vector_values; j++) { + aligned_vector_t avj(vvp[j].v); + test_one_vector(code, vt, NULL, avj.addr()); + } } } else if (vt.operands[1] == vI) { @@ -2056,8 +2080,10 @@ void powerpc_test_cpu::test_vector_arith(void) } } else if (vt.operands[1] == __ && vt.operands[2] == vB) { - for (int i = 0; i < n_vector_values; i++) - test_one_vector(code, vt, NULL, &vvp[i].v); + for (int i = 0; i < n_vector_values; i++) { + aligned_vector_t avi(vvp[i].v); + test_one_vector(code, vt, NULL, avi.addr()); + } } else { printf("ERROR: unhandled test case\n");