From b3bd00f159bb5614176d8219a0dfbe0a656defb1 Mon Sep 17 00:00:00 2001
From: gbeauche <>
Date: Mon, 1 Nov 2004 17:12:55 +0000
Subject: [PATCH] - affine need_to_preserve[] to get close to linux/x86_64 ABI
 - optimize NOP fillers on x86-64 (based on GNU as implementation)

---
 .../src/uae_cpu/compiler/codegen_x86.cpp      | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/BasiliskII/src/uae_cpu/compiler/codegen_x86.cpp b/BasiliskII/src/uae_cpu/compiler/codegen_x86.cpp
index 361c8047..e4317d01 100644
--- a/BasiliskII/src/uae_cpu/compiler/codegen_x86.cpp
+++ b/BasiliskII/src/uae_cpu/compiler/codegen_x86.cpp
@@ -107,7 +107,12 @@ uae_u8 call_saved[]={0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0};
    - Special registers (such like the stack pointer) should not be "preserved"
      by pushing, even though they are "saved" across function calls
 */
-uae_u8 need_to_preserve[]={1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1};
+#if defined(__x86_64__)
+/* callee-saved registers as defined by Linux/x86_64 ABI: rbx, rbp, rsp, r12 - r15 */
+static const uae_u8 need_to_preserve[]={0,0,0,1,0,1,0,0,0,0,0,0,1,1,1,1};
+#else
+static const uae_u8 need_to_preserve[]={1,1,1,1,0,1,1,1};
+#endif
 
 /* Whether classes of instructions do or don't clobber the native flags */
 #define CLOBBER_MOV
@@ -3139,7 +3144,28 @@ static __inline__ void raw_emit_nop_filler(int nbytes)
     f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
     f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
   };
+  static const uae_u8 prefixes[4] = { 0x66, 0x66, 0x66, 0x66 };
 
+#if defined(__x86_64__)
+  /* The recommended way to pad 64bit code is to use NOPs preceded by
+     maximally four 0x66 prefixes.  Balance the size of nops.  */
+  if (nbytes == 0)
+	  return;
+
+  int i;
+  int nnops = (nbytes + 3) / 4;
+  int len = nbytes / nnops;
+  int remains = nbytes - nnops * len;
+
+  for (i = 0; i < remains; i++) {
+	  emit_block(prefixes, len);
+	  raw_nop();
+  }
+  for (; i < nnops; i++) {
+	  emit_block(prefixes, len - 1);
+	  raw_nop();
+  }
+#else
   int nloops = nbytes / 16;
   while (nloops-- > 0)
 	emit_block(f32_16, sizeof(f32_16));
@@ -3147,6 +3173,7 @@ static __inline__ void raw_emit_nop_filler(int nbytes)
   nbytes %= 16;
   if (nbytes)
 	emit_block(f32_patt[nbytes - 1], nbytes);
+#endif
 }