From d7cd088c2b43235ac35c4238d80d122ee03ece9d Mon Sep 17 00:00:00 2001
From: Cameron Kaiser
Date: Sat, 27 Jan 2018 13:04:49 -0800
Subject: [PATCH] #470: clean up row filters, collapse into permute

---
 gfx/ycbcr/yuv_convert_ppc.cpp | 55 ++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/gfx/ycbcr/yuv_convert_ppc.cpp b/gfx/ycbcr/yuv_convert_ppc.cpp
index 6165261b3..979e4d6c1 100644
--- a/gfx/ycbcr/yuv_convert_ppc.cpp
+++ b/gfx/ycbcr/yuv_convert_ppc.cpp
@@ -15,42 +15,49 @@ namespace gfx {
 // VMX version does 16 pixels at a time.
 void FilterRows_VMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                     int source_width, int source_y_fraction) {
-  // splat the multiplicands. AltiVec makes this unnecessarily difficult.
+  register vector unsigned short vector_zero = vec_splat_u16(0);
+  register vector unsigned char r0, c0, c1;
+  register vector unsigned short y0, y1, y2, y3;
+
+  uint8 *end = ybuf + source_width;
+
+  // Although you'd think using a version with vec_avg for 50% would
+  // be profitable to write, in practice it doesn't seem to be used
+  // much if at all, so this doesn't implement one.
+
+  // Splat the multiplicands. AltiVec makes this unnecessarily difficult.
   unsigned short __attribute__ ((aligned(16))) syf = source_y_fraction;
   unsigned short __attribute__ ((aligned(16))) syf2 = (256 - source_y_fraction);
   register vector unsigned short y1_fraction = vec_lde(0, &syf);
   y1_fraction = vec_splat(y1_fraction, 0);
   register vector unsigned short y0_fraction = vec_lde(0, &syf2);
   y0_fraction = vec_splat(y0_fraction, 0);
-
-  register vector unsigned short vector_eight = vec_splat_u16(8);
-  register vector unsigned short vector_zero = vec_splat_u16(0);
-  register vector unsigned char vector_c_zero = vec_splat_u8(0);
-
-  uint8 *end = ybuf + source_width;
+
+  // Permute vector for combining shift and pack in one operation.
+  // This effectively shifts each vector down by 8 bits and packs.
+  register vector unsigned char vector_sh8pak =
+    { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 };
 
   // Compute a weighted average.
   do {
-    vector unsigned char r0;
-    vector unsigned char c0 = vec_ld(0, y0_ptr);
-    vector unsigned char c1 = vec_ld(0, y1_ptr);
+    c0 = vec_ld(0, y0_ptr);
+    c1 = vec_ld(0, y1_ptr);
 
-    // another VMX annoyance: unpackh/l are SIGNED. bastard Motorola.
-    register vector unsigned short y0 = vec_mergeh(vector_c_zero, c0);
-    register vector unsigned short y1 = vec_mergeh(vector_c_zero, c1);
-    register vector unsigned short y2 = vec_mergel(vector_c_zero, c0);
-    register vector unsigned short y3 = vec_mergel(vector_c_zero, c1);
+    // Expand to short, since vec_mladd does not exist for char (damn).
+    y0 = vec_mergeh((vector unsigned char)vector_zero, c0);
+    y1 = vec_mergeh((vector unsigned char)vector_zero, c1);
+    y2 = vec_mergel((vector unsigned char)vector_zero, c0);
+    y3 = vec_mergel((vector unsigned char)vector_zero, c1);
 
-    // FUSED MULTIPLY ADD, BEYOTCHES! INTEL SUX!
-    y1 = vec_mladd(y1, y1_fraction, vector_zero);
-    y0 = vec_mladd(y0, y0_fraction, y1);
-    y0 = vec_sr(y0, vector_eight);
+    // FUSED MULTIPLY ADD, BEYOTCHES! INTEL SUX!
+    // Interleave the operations.
+    y1 = vec_mladd(y1, y1_fraction, vector_zero);
+    y3 = vec_mladd(y3, y1_fraction, vector_zero);
+    y0 = vec_mladd(y0, y0_fraction, y1);
+    y2 = vec_mladd(y2, y0_fraction, y3);
 
-    y3 = vec_mladd(y3, y1_fraction, vector_zero);
-    y2 = vec_mladd(y2, y0_fraction, y3);
-    y2 = vec_sr(y2, vector_eight);
-
-    r0 = vec_pack(y0, y2);
+    // Turn vec_sr on y0/y2 and a vec_pack into a single op.
+    r0 = vec_perm((vector unsigned char)y0, (vector unsigned char)y2, vector_sh8pak);
 
     vec_st(r0, 0, (unsigned char *)ybuf);
 
     ybuf += 16;
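For reference, here is a minimal scalar sketch of the blend that FilterRows_VMX vectorizes; the helper name FilterRows_Scalar and the use of <stdint.h> are illustrative only and not part of the patch. Each output byte is the 8.8 fixed-point blend (y0 * (256 - frac) + y1 * frac) >> 8. Because that 16-bit sum's high byte is exactly the shifted result on big-endian VMX, the new vector_sh8pak permute (bytes 0, 2, 4, ...) does the work of the old vec_sr-by-8 plus vec_pack pair in a single operation.

#include <stdint.h>

// Illustrative scalar equivalent of the VMX row filter (hypothetical helper,
// not part of the patch or the tree).
static void FilterRows_Scalar(uint8_t* ybuf, const uint8_t* y0_ptr,
                              const uint8_t* y1_ptr, int source_width,
                              int source_y_fraction) {
  int y1_fraction = source_y_fraction;        // weight of the second row
  int y0_fraction = 256 - source_y_fraction;  // weight of the first row
  for (int x = 0; x < source_width; x++) {
    // The weighted sum fits in 16 bits (at most 255 * 256); its high byte is
    // the blended pixel, i.e. the byte the VMX code now extracts with
    // vec_perm instead of vec_sr followed by vec_pack.
    ybuf[x] = (uint8_t)((y0_ptr[x] * y0_fraction +
                         y1_ptr[x] * y1_fraction) >> 8);
  }
}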