From d7cd088c2b43235ac35c4238d80d122ee03ece9d Mon Sep 17 00:00:00 2001
From: Cameron Kaiser
Date: Sat, 27 Jan 2018 13:04:49 -0800
Subject: [PATCH] #470: clean up row filters, collapse into permute

---
 gfx/ycbcr/yuv_convert_ppc.cpp | 55 ++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/gfx/ycbcr/yuv_convert_ppc.cpp b/gfx/ycbcr/yuv_convert_ppc.cpp
index 6165261b3..979e4d6c1 100644
--- a/gfx/ycbcr/yuv_convert_ppc.cpp
+++ b/gfx/ycbcr/yuv_convert_ppc.cpp
@@ -15,42 +15,49 @@ namespace gfx {
 // VMX version does 16 pixels at a time.
 void FilterRows_VMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                     int source_width, int source_y_fraction) {
-  // splat the multiplicands. AltiVec makes this unnecessarily difficult.
+  register vector unsigned short vector_zero = vec_splat_u16(0);
+  register vector unsigned char r0, c0, c1;
+  register vector unsigned short y0, y1, y2, y3;
+
+  uint8 *end = ybuf + source_width;
+
+  // Although you'd think using a version with vec_avg for 50% would
+  // be profitable to write, in practice it doesn't seem to be used
+  // much if at all, so this doesn't implement one.
+
+  // Splat the multiplicands. AltiVec makes this unnecessarily difficult.
   unsigned short __attribute__ ((aligned(16))) syf = source_y_fraction;
   unsigned short __attribute__ ((aligned(16))) syf2 = (256 - source_y_fraction);
   register vector unsigned short y1_fraction = vec_lde(0, &syf);
   y1_fraction = vec_splat(y1_fraction, 0);
   register vector unsigned short y0_fraction = vec_lde(0, &syf2);
   y0_fraction = vec_splat(y0_fraction, 0);
-
-  register vector unsigned short vector_eight = vec_splat_u16(8);
-  register vector unsigned short vector_zero = vec_splat_u16(0);
-  register vector unsigned char vector_c_zero = vec_splat_u8(0);
-
-  uint8 *end = ybuf + source_width;
+
+  // Permute vector for combining shift and pack in one operation.
+  // This effectively shifts each vector down by 8 bits and packs.
+  register vector unsigned char vector_sh8pak =
+    { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 };
 
   // Compute a weighted average.
   do {
-    vector unsigned char r0;
-    vector unsigned char c0 = vec_ld(0, y0_ptr);
-    vector unsigned char c1 = vec_ld(0, y1_ptr);
+    c0 = vec_ld(0, y0_ptr);
+    c1 = vec_ld(0, y1_ptr);
 
-    // another VMX annoyance: unpackh/l are SIGNED. bastard Motorola.
-    register vector unsigned short y0 = vec_mergeh(vector_c_zero, c0);
-    register vector unsigned short y1 = vec_mergeh(vector_c_zero, c1);
-    register vector unsigned short y2 = vec_mergel(vector_c_zero, c0);
-    register vector unsigned short y3 = vec_mergel(vector_c_zero, c1);
+    // Expand to short, since vec_mladd does not exist for char (damn).
+    y0 = vec_mergeh((vector unsigned char)vector_zero, c0);
+    y1 = vec_mergeh((vector unsigned char)vector_zero, c1);
+    y2 = vec_mergel((vector unsigned char)vector_zero, c0);
+    y3 = vec_mergel((vector unsigned char)vector_zero, c1);
 
-    // FUSED MULTIPLY ADD, BEYOTCHES! INTEL SUX!
-    y1 = vec_mladd(y1, y1_fraction, vector_zero);
-    y0 = vec_mladd(y0, y0_fraction, y1);
-    y0 = vec_sr(y0, vector_eight);
+    // FUSED MULTIPLY ADD, BEYOTCHES! INTEL SUX!
+    // Interleave the operations.
+    y1 = vec_mladd(y1, y1_fraction, vector_zero);
+    y3 = vec_mladd(y3, y1_fraction, vector_zero);
+    y0 = vec_mladd(y0, y0_fraction, y1);
+    y2 = vec_mladd(y2, y0_fraction, y3);
 
-    y3 = vec_mladd(y3, y1_fraction, vector_zero);
-    y2 = vec_mladd(y2, y0_fraction, y3);
-    y2 = vec_sr(y2, vector_eight);
-
-    r0 = vec_pack(y0, y2);
+    // Turn vec_sr on y0/y2 and a vec_pack into a single op.
+    r0 = vec_perm((vector unsigned char)y0, (vector unsigned char)y2, vector_sh8pak);
 
     vec_st(r0, 0, (unsigned char *)ybuf);
 
     ybuf += 16;
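For reference, here is a minimal scalar sketch of the blend that FilterRows_VMX vectorizes; the helper name FilterRows_Scalar and the use of <stdint.h> are illustrative only and not part of the patch. Each output byte is the 8.8 fixed-point blend (y0 * (256 - frac) + y1 * frac) >> 8. Because that 16-bit sum's high byte is exactly the shifted result on big-endian VMX, the new vector_sh8pak permute (bytes 0, 2, 4, ...) does the work of the old vec_sr-by-8 plus vec_pack pair in a single operation.

#include <stdint.h>

// Illustrative scalar equivalent of the VMX row filter (hypothetical helper,
// not part of the patch or the tree).
static void FilterRows_Scalar(uint8_t* ybuf, const uint8_t* y0_ptr,
                              const uint8_t* y1_ptr, int source_width,
                              int source_y_fraction) {
  int y1_fraction = source_y_fraction;        // weight of the second row
  int y0_fraction = 256 - source_y_fraction;  // weight of the first row
  for (int x = 0; x < source_width; x++) {
    // The weighted sum fits in 16 bits (at most 255 * 256); its high byte is
    // the blended pixel, i.e. the byte the VMX code now extracts with
    // vec_perm instead of vec_sr followed by vec_pack.
    ybuf[x] = (uint8_t)((y0_ptr[x] * y0_fraction +
                         y1_ptr[x] * y1_fraction) >> 8);
  }
}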