diff --git a/media/libpng/moz.build b/media/libpng/moz.build
index b41a692af..d8e0a9468 100644
--- a/media/libpng/moz.build
+++ b/media/libpng/moz.build
@@ -48,11 +48,11 @@ if CONFIG['INTEL_ARCHITECTURE']:
         'intel/intel_init.c'
     ]
 
-if CONFIG['HAVE_ALTIVEC_NOT_ENABLED_YET']:
-    DEFINES['MOZ_PNG_USE_POWERPC'] = True
+if CONFIG['TENFOURFOX_VMX']:
+    DEFINES['MOZ_PNG_USE_ALTIVEC'] = True
     UNIFIED_SOURCES += [
-        'powerpc/filter_vsx_intrinsics.c',
-        'powerpc/powerpc_init.c'
+        'powerpc/filter_vmx_intrinsics.c',
+        'powerpc/powerpc_vmx_init.c'
     ]
 
 if CONFIG['MOZ_TREE_FREETYPE']:
diff --git a/media/libpng/pnglibconf.h b/media/libpng/pnglibconf.h
index 618ec28c5..e678a1b7d 100644
--- a/media/libpng/pnglibconf.h
+++ b/media/libpng/pnglibconf.h
@@ -69,10 +69,10 @@
 #  define PNG_INTEL_SSE_OPT 0
 #endif
 
-#ifdef MOZ_PNG_USE_POWERPC
-#  undef PNG_POWERPC_VSX_OPT /* Let libpng decide */
+#ifdef MOZ_PNG_USE_ALTIVEC
+#  undef PNG_POWERPC_VMX_OPT /* Let libpng decide */
 #else
-#  define PNG_POWERPC_VSX_OPT 0 /* Do not use VSX optimization */
+#  define PNG_POWERPC_VMX_OPT 0 /* Do not use VMX optimization */
 #endif
 
 #define PNG_READ_SUPPORTED
diff --git a/media/libpng/pngpriv.h b/media/libpng/pngpriv.h
index 2262c0671..61abb37bf 100644
--- a/media/libpng/pngpriv.h
+++ b/media/libpng/pngpriv.h
@@ -200,6 +200,14 @@
 #  endif
 #endif
 
+#ifndef PNG_POWERPC_VMX_OPT
+#  if defined(__ALTIVEC__)
+#     define PNG_POWERPC_VMX_OPT 2
+#  else
+#     define PNG_POWERPC_VMX_OPT 0
+#  endif
+#endif
+
 #ifndef PNG_INTEL_SSE_OPT
 #   ifdef PNG_INTEL_SSE
       /* Only check for SSE if the build configuration has been modified to
@@ -261,6 +269,10 @@
 #  define PNG_POWERPC_VSX_IMPLEMENTATION 1
 #endif
 
+#if PNG_POWERPC_VMX_OPT > 0
+#  define PNG_FILTER_OPTIMIZATIONS png_init_filter_functions_vmx
+#  define PNG_POWERPC_VMX_IMPLEMENTATION 1
+#endif
 
 /* Is this a build of a DLL where compilation of the object modules requires
  * different preprocessor settings to those required for a simple library?  If
@@ -1354,6 +1366,24 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_vsx,(png_row_infop
     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
 #endif
 
+#if PNG_POWERPC_VMX_OPT > 0
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_up_vmx,(png_row_infop row_info,
+    png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_vmx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub4_vmx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg3_vmx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_avg4_vmx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_vmx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_vmx,(png_row_infop
+    row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+#endif
+
+
 #if PNG_INTEL_SSE_IMPLEMENTATION > 0
 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_sub3_sse2,(png_row_infop
     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
diff --git a/media/libpng/powerpc/filter_vmx_intrinsics.c b/media/libpng/powerpc/filter_vmx_intrinsics.c
new file mode 100644
index 000000000..f0e553c30
--- /dev/null
+++ b/media/libpng/powerpc/filter_vmx_intrinsics.c
@@ -0,0 +1,763 @@
+/* filter_vmx_intrinsics.c - PowerPC optimised filter functions
+ * for original AltiVec/VMX
+ *
+ * Copyright (c) 2017 Glenn Randers-Pehrson
+ * Written by Cameron Kaiser for TenFourFox
+ * Based on filter_vsx_intrinsics.c for POWER7 by Vadim Barkov, 2017.
+ * Last changed in libpng 1.6.29 [March 16, 2017]
+ *
+ * This code is released under the libpng license.
+ * For conditions of distribution and use, see the disclaimer
+ * and license in png.h
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include "../pngpriv.h"
+
+#ifdef PNG_READ_SUPPORTED
+
+/* This code requires -maltivec on the command line: */
+#if PNG_POWERPC_VMX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
+
+#include <altivec.h>
+
+#if PNG_POWERPC_VMX_OPT > 0
+
+#ifndef __ALTIVEC__
+#  error "This code requires AltiVec/VMX. Please provide -maltivec compiler flag."
+#endif
+
+#define vec_ld_unaligned(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); } /* XXX unused? */
+#if(0)
+inline void vec_st_unaligned(vector unsigned char src, unsigned char *target) {
+   vector unsigned char MSQ, LSQ, result;
+   vector unsigned char mask, align, zero, neg1;
+
+   MSQ = vec_ld(0, target);
+   LSQ = vec_ld(16, target);
+   align = vec_lvsr(0, target);
+   zero = vec_splat_u8(0);
+   neg1 = vec_splat_s8(-1);
+   mask = vec_perm(zero, neg1, align);
+   src = vec_perm(src, src, align);
+   MSQ = vec_sel(MSQ, src, mask);
+   LSQ = vec_sel(src, LSQ, mask);
+
+   vec_st(MSQ, 0, target);
+   vec_st(LSQ, 16, target);
+}
+#endif
+
+/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
+ * Given the following element positions,
+ *    prev:  c b
+ *    row:   a d
+ * the Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
+ * whichever of a, b, or c is closest to p=a+b-c.
+ * ( this is taken from ../intel/filter_sse2_intrinsics.c )
+ */
+
+#define vmx_declare_common_vars(row_info,row,prev_row,offset) \
+   png_byte i;\
+   png_bytep rp = row + offset;\
+   png_const_bytep pp = prev_row;\
+   png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\
+   png_size_t istop;\
+   if(unaligned_top == 16)\
+      unaligned_top = 0;\
+   istop = row_info->rowbytes;\
+   if((unaligned_top < istop))\
+      istop -= unaligned_top;\
+   else{\
+      unaligned_top = istop;\
+      istop = 0;\
+   }
+
+void png_read_filter_row_up_vmx(png_row_infop row_info, png_bytep row,
+                                png_const_bytep prev_row)
+{
+   vector unsigned char rp_vec;
+   vector unsigned char pp_vec;
+   vmx_declare_common_vars(row_info,row,prev_row,0)
+
+   /* Altivec operations require 16-byte aligned data
+    * but input can be unaligned. So we calculate
+    * unaligned part as usual.
+    */
+   for (i = 0; i < unaligned_top; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+      rp++;
+   }
+
+   /* Using SIMD while we can */
+   while( istop >= 16 )
+   {
+      rp_vec = vec_ld(0,rp);
+      vec_ld_unaligned(pp_vec,pp);
+
+      rp_vec = vec_add(rp_vec,pp_vec);
+
+      vec_st(rp_vec,0,rp);
+
+      pp += 16;
+      rp += 16;
+      istop -= 16;
+   }
+
+   if(istop > 0)
+   {
+      /* If byte count of row is not divisible by 16
+       * we will process remaining part as usual
+       */
+      for (i = 0; i < istop; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+         rp++;
+      }
+   }
+}
+
+static const vector unsigned char VMX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
+static const vector unsigned char VMX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};
+
+static const vector unsigned char VMX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
+static const vector unsigned char VMX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};
+
+static const vector unsigned char VMX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
+static const vector unsigned char VMX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};
+
+static const vector unsigned char VMX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
+static const vector unsigned char VMX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};
+
+static const vector unsigned char VMX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+static const vector unsigned char VMX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};
+
+static const vector unsigned char VMX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
+static const vector unsigned char VMX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};
+
+static const vector unsigned char VMX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};
+
+static const vector unsigned char VMX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
+static const vector unsigned char VMX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
+static const vector unsigned char VMX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};
+
+#define vmx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VMX_CHAR_ZERO,VMX_CHAR_TO_SHORT##offset##_##bpp)
+#define vmx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VMX_CHAR_ZERO,VMX_SHORT_TO_CHAR##offset##_##bpp)
+
+#ifdef PNG_USE_ABS
+#  define vmx_abs(number) abs(number)
+#else
+#  define vmx_abs(number) ((number) > 0 ? (number) : -(number))
+#endif
+
+void png_read_filter_row_sub4_vmx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   const png_byte bpp = 4;
+
+   vector unsigned char rp_vec;
+   vector unsigned char part_vec;
+
+   vmx_declare_common_vars(row_info,row,prev_row,bpp)
+
+   PNG_UNUSED(pp)
+
+   /* Altivec operations require 16-byte aligned data
+    * but input can be unaligned. So we calculate
+    * unaligned part as usual.
+    */
+   for (i = 0; i < unaligned_top; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+      rp++;
+   }
+
+   /* Using SIMD while we can */
+   while( istop >= 16 )
+   {
+      for(i=0;i < bpp ; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+         rp++;
+      }
+      rp -= bpp;
+
+      rp_vec = vec_ld(0,rp);
+      part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED1_4);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED2_4);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED3_4);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      vec_st(rp_vec,0,rp);
+
+      rp += 16;
+      istop -= 16;
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
+         rp++;
+      }
+
+}
+
+void png_read_filter_row_sub3_vmx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   const png_byte bpp = 3;
+
+   vector unsigned char rp_vec;
+   vector unsigned char part_vec;
+
+   vmx_declare_common_vars(row_info,row,prev_row,bpp)
+
+   PNG_UNUSED(pp)
+
+   /* Altivec operations require 16-byte aligned data
+    * but input can be unaligned. So we calculate
+    * unaligned part as usual.
+    */
+   for (i = 0; i < unaligned_top; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+      rp++;
+   }
+
+   /* Using SIMD while we can */
+   while( istop >= 16 )
+   {
+      for(i=0;i < bpp ; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+         rp++;
+      }
+      rp -= bpp;
+
+      rp_vec = vec_ld(0,rp);
+      part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED1_3);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED2_3);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED3_3);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED4_3);
+      rp_vec = vec_add(rp_vec,part_vec);
+
+      vec_st(rp_vec,0,rp);
+      rp += 15;
+      istop -= 16;
+
+      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
+       * be processed manually
+       */
+      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+      rp++;
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
+         rp++;
+      }
+}
+
+void png_read_filter_row_avg4_vmx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   const png_byte bpp = 4;
+
+   vector unsigned char rp_vec;
+   vector unsigned char pp_vec;
+   vector unsigned char pp_part_vec;
+   vector unsigned char rp_part_vec;
+   vector unsigned char avg_vec;
+
+   vmx_declare_common_vars(row_info,row,prev_row,bpp)
+   rp -= bpp;
+   if(istop >= bpp)
+      istop -= bpp;
+
+   for (i = 0; i < bpp; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) +
+         ((int)(*pp++) / 2 )) & 0xff);
+
+      rp++;
+   }
+
+   /* Altivec operations require 16-byte aligned data
+    * but input can be unaligned. So we calculate
+    * unaligned part as usual.
+    */
+   for (i = 0; i < unaligned_top; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) +
+         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+      rp++;
+   }
+
+   /* Using SIMD while we can */
+   while( istop >= 16 )
+   {
+      for(i=0;i < bpp ; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) +
+            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+         rp++;
+      }
+      rp -= bpp;
+      pp -= bpp;
+
+      vec_ld_unaligned(pp_vec,pp);
+      rp_vec = vec_ld(0,rp);
+
+      rp_part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED1_4);
+      pp_part_vec = vec_perm(pp_vec,VMX_CHAR_ZERO,VMX_NOT_SHIFTED1_4);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      rp_part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED2_4);
+      pp_part_vec = vec_perm(pp_vec,VMX_CHAR_ZERO,VMX_NOT_SHIFTED2_4);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      rp_part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED3_4);
+      pp_part_vec = vec_perm(pp_vec,VMX_CHAR_ZERO,VMX_NOT_SHIFTED3_4);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      vec_st(rp_vec,0,rp);
+
+      rp += 16;
+      pp += 16;
+      istop -= 16;
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) +
+            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+         rp++;
+      }
+}
+
+void png_read_filter_row_avg3_vmx(png_row_infop row_info, png_bytep row,
+                                  png_const_bytep prev_row)
+{
+   const png_byte bpp = 3;
+
+   vector unsigned char rp_vec;
+   vector unsigned char pp_vec;
+   vector unsigned char pp_part_vec;
+   vector unsigned char rp_part_vec;
+   vector unsigned char avg_vec;
+
+   vmx_declare_common_vars(row_info,row,prev_row,bpp)
+   rp -= bpp;
+   if(istop >= bpp)
+      istop -= bpp;
+
+   for (i = 0; i < bpp; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) +
+         ((int)(*pp++) / 2 )) & 0xff);
+
+      rp++;
+   }
+
+   /* Altivec operations require 16-byte aligned data
+    * but input can be unaligned. So we calculate
+    * unaligned part as usual.
+    */
+   for (i = 0; i < unaligned_top; i++)
+   {
+      *rp = (png_byte)(((int)(*rp) +
+         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+      rp++;
+   }
+
+   /* Using SIMD while we can */
+   while( istop >= 16 )
+   {
+      for(i=0;i < bpp ; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) +
+            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+         rp++;
+      }
+      rp -= bpp;
+      pp -= bpp;
+
+      vec_ld_unaligned(pp_vec,pp);
+      rp_vec = vec_ld(0,rp);
+
+      rp_part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED1_3);
+      pp_part_vec = vec_perm(pp_vec,VMX_CHAR_ZERO,VMX_NOT_SHIFTED1_3);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      rp_part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED2_3);
+      pp_part_vec = vec_perm(pp_vec,VMX_CHAR_ZERO,VMX_NOT_SHIFTED2_3);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      rp_part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED3_3);
+      pp_part_vec = vec_perm(pp_vec,VMX_CHAR_ZERO,VMX_NOT_SHIFTED3_3);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      rp_part_vec = vec_perm(rp_vec,VMX_CHAR_ZERO,VMX_LEFTSHIFTED4_3);
+      pp_part_vec = vec_perm(pp_vec,VMX_CHAR_ZERO,VMX_NOT_SHIFTED4_3);
+      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
+      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
+      rp_vec = vec_add(rp_vec,avg_vec);
+
+      vec_st(rp_vec,0,rp);
+
+      rp += 15;
+      pp += 15;
+      istop -= 16;
+
+      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
+       * be processed manually
+       */
+      *rp = (png_byte)(((int)(*rp) +
+         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+      rp++;
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         *rp = (png_byte)(((int)(*rp) +
+            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
+
+         rp++;
+      }
+}
+
+/* Bytewise c ? t : e. */
+#define if_then_else(c,t,e) vec_sel(e,t,c)
+
+#define vmx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
+      c = *(pp - bpp);\
+      a = *(rp - bpp);\
+      b = *pp++;\
+      p = b - c;\
+      pc = a - c;\
+      pa = vmx_abs(p);\
+      pb = vmx_abs(pc);\
+      pc = vmx_abs(p + pc);\
+      if (pb < pa) pa = pb, a = b;\
+      if (pc < pa) a = c;\
+      a += *rp;\
+      *rp++ = (png_byte)a;\
+   }
+
+void png_read_filter_row_paeth4_vmx(png_row_infop row_info, png_bytep row,
+                                    png_const_bytep prev_row)
+{
+   const png_byte bpp = 4;
+
+   int a, b, c, pa, pb, pc, p;
+   vector unsigned char rp_vec;
+   vector unsigned char pp_vec;
+   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
+   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;
+
+   vmx_declare_common_vars(row_info,row,prev_row,bpp)
+   rp -= bpp;
+   if(istop >= bpp)
+      istop -= bpp;
+
+   /* Process the first pixel in the row completely (this is the same as 'up'
+    * because there is only one candidate predictor for the first row).
+    */
+   for(i = 0; i < bpp ; i++)
+   {
+      *rp = (png_byte)( *rp + *pp);
+      rp++;
+      pp++;
+   }
+
+   for(i = 0; i < unaligned_top ; i++)
+   {
+      vmx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+   }
+
+   while( istop >= 16)
+   {
+      for(i = 0; i < bpp ; i++)
+      {
+         vmx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+      }
+
+      rp -= bpp;
+      pp -= bpp;
+      rp_vec = vec_ld(0,rp);
+      vec_ld_unaligned(pp_vec,pp);
+
+      a_vec = vmx_char_to_short(vec_perm(rp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED1_4),1,4);
+      b_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_NOT_SHIFTED1_4),1,4);
+      c_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED1_4),1,4);
+      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
+      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
+      pc_vec = vec_add(pa_vec,pb_vec);
+      pa_vec = vec_abs(pa_vec);
+      pb_vec = vec_abs(pb_vec);
+      pc_vec = vec_abs(pc_vec);
+      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
+      nearest_vec = if_then_else(
+            vec_cmpeq(pa_vec,smallest_vec),
+            a_vec,
+            if_then_else(
+               vec_cmpeq(pb_vec,smallest_vec),
+               b_vec,
+               c_vec
+               )
+            );
+      rp_vec = vec_add(rp_vec,(vmx_short_to_char(nearest_vec,1,4)));
+
+      a_vec = vmx_char_to_short(vec_perm(rp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED2_4),2,4);
+      b_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_NOT_SHIFTED2_4),2,4);
+      c_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED2_4),2,4);
+      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
+      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
+      pc_vec = vec_add(pa_vec,pb_vec);
+      pa_vec = vec_abs(pa_vec);
+      pb_vec = vec_abs(pb_vec);
+      pc_vec = vec_abs(pc_vec);
+      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
+      nearest_vec = if_then_else(
+            vec_cmpeq(pa_vec,smallest_vec),
+            a_vec,
+            if_then_else(
+               vec_cmpeq(pb_vec,smallest_vec),
+               b_vec,
+               c_vec
+               )
+            );
+      rp_vec = vec_add(rp_vec,(vmx_short_to_char(nearest_vec,2,4)));
+
+      a_vec = vmx_char_to_short(vec_perm(rp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED3_4),3,4);
+      b_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_NOT_SHIFTED3_4),3,4);
+      c_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED3_4),3,4);
+      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
+      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
+      pc_vec = vec_add(pa_vec,pb_vec);
+      pa_vec = vec_abs(pa_vec);
+      pb_vec = vec_abs(pb_vec);
+      pc_vec = vec_abs(pc_vec);
+      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
+      nearest_vec = if_then_else(
+            vec_cmpeq(pa_vec,smallest_vec),
+            a_vec,
+            if_then_else(
+               vec_cmpeq(pb_vec,smallest_vec),
+               b_vec,
+               c_vec
+               )
+            );
+      rp_vec = vec_add(rp_vec,(vmx_short_to_char(nearest_vec,3,4)));
+
+      vec_st(rp_vec,0,rp);
+
+      rp += 16;
+      pp += 16;
+      istop -= 16;
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         vmx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+      }
+}
+
+void png_read_filter_row_paeth3_vmx(png_row_infop row_info, png_bytep row,
+                                    png_const_bytep prev_row)
+{
+   const png_byte bpp = 3;
+
+   int a, b, c, pa, pb, pc, p;
+   vector unsigned char rp_vec;
+   vector unsigned char pp_vec;
+   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
+   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;
+
+   vmx_declare_common_vars(row_info,row,prev_row,bpp)
+   rp -= bpp;
+   if(istop >= bpp)
+      istop -= bpp;
+
+   /* Process the first pixel in the row completely (this is the same as 'up'
+    * because there is only one candidate predictor for the first row).
+    */
+   for(i = 0; i < bpp ; i++)
+   {
+      *rp = (png_byte)( *rp + *pp);
+      rp++;
+      pp++;
+   }
+
+   for(i = 0; i < unaligned_top ; i++)
+   {
+      vmx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+   }
+
+   while( istop >= 16)
+   {
+      for(i = 0; i < bpp ; i++)
+      {
+         vmx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+      }
+
+      rp -= bpp;
+      pp -= bpp;
+      rp_vec = vec_ld(0,rp);
+      vec_ld_unaligned(pp_vec,pp);
+
+      a_vec = vmx_char_to_short(vec_perm(rp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED1_3),1,3);
+      b_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_NOT_SHIFTED1_3),1,3);
+      c_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED1_3),1,3);
+      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
+      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
+      pc_vec = vec_add(pa_vec,pb_vec);
+      pa_vec = vec_abs(pa_vec);
+      pb_vec = vec_abs(pb_vec);
+      pc_vec = vec_abs(pc_vec);
+      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
+      nearest_vec = if_then_else(
+            vec_cmpeq(pa_vec,smallest_vec),
+            a_vec,
+            if_then_else(
+               vec_cmpeq(pb_vec,smallest_vec),
+               b_vec,
+               c_vec
+               )
+            );
+      rp_vec = vec_add(rp_vec,(vmx_short_to_char(nearest_vec,1,3)));
+
+      a_vec = vmx_char_to_short(vec_perm(rp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED2_3),2,3);
+      b_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_NOT_SHIFTED2_3),2,3);
+      c_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED2_3),2,3);
+      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
+      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
+      pc_vec = vec_add(pa_vec,pb_vec);
+      pa_vec = vec_abs(pa_vec);
+      pb_vec = vec_abs(pb_vec);
+      pc_vec = vec_abs(pc_vec);
+      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
+      nearest_vec = if_then_else(
+            vec_cmpeq(pa_vec,smallest_vec),
+            a_vec,
+            if_then_else(
+               vec_cmpeq(pb_vec,smallest_vec),
+               b_vec,
+               c_vec
+               )
+            );
+      rp_vec = vec_add(rp_vec,(vmx_short_to_char(nearest_vec,2,3)));
+
+      a_vec = vmx_char_to_short(vec_perm(rp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED3_3),3,3);
+      b_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_NOT_SHIFTED3_3),3,3);
+      c_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED3_3),3,3);
+      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
+      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
+      pc_vec = vec_add(pa_vec,pb_vec);
+      pa_vec = vec_abs(pa_vec);
+      pb_vec = vec_abs(pb_vec);
+      pc_vec = vec_abs(pc_vec);
+      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
+      nearest_vec = if_then_else(
+            vec_cmpeq(pa_vec,smallest_vec),
+            a_vec,
+            if_then_else(
+               vec_cmpeq(pb_vec,smallest_vec),
+               b_vec,
+               c_vec
+               )
+            );
+      rp_vec = vec_add(rp_vec,(vmx_short_to_char(nearest_vec,3,3)));
+
+      a_vec = vmx_char_to_short(vec_perm(rp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED4_3),4,3);
+      b_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_NOT_SHIFTED4_3),4,3);
+      c_vec = vmx_char_to_short(vec_perm(pp_vec , VMX_CHAR_ZERO , VMX_LEFTSHIFTED4_3),4,3);
+      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
+      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
+      pc_vec = vec_add(pa_vec,pb_vec);
+      pa_vec = vec_abs(pa_vec);
+      pb_vec = vec_abs(pb_vec);
+      pc_vec = vec_abs(pc_vec);
+      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
+      nearest_vec = if_then_else(
+            vec_cmpeq(pa_vec,smallest_vec),
+            a_vec,
+            if_then_else(
+               vec_cmpeq(pb_vec,smallest_vec),
+               b_vec,
+               c_vec
+               )
+            );
+      rp_vec = vec_add(rp_vec,(vmx_short_to_char(nearest_vec,4,3)));
+
+      vec_st(rp_vec,0,rp);
+
+      rp += 15;
+      pp += 15;
+      istop -= 16;
+
+      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
+       * be processed manually
+       */
+      vmx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+   }
+
+   if(istop > 0)
+      for (i = 0; i < istop % 16; i++)
+      {
+         vmx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
+      }
+}
+
+#endif /* PNG_POWERPC_VMX_OPT > 0 */
+#endif /* PNG_POWERPC_VMX_IMPLEMENTATION == 1 (intrinsics) */
+#endif /* READ */
diff --git a/media/libpng/powerpc/powerpc_vmx_init.c b/media/libpng/powerpc/powerpc_vmx_init.c
new file mode 100644
index 000000000..7ea51bfc4
--- /dev/null
+++ b/media/libpng/powerpc/powerpc_vmx_init.c
@@ -0,0 +1,50 @@
+
+/* powerpc_vmx_init.c - PowerPC optimised filter functions
+ * for original AltiVec/VMX
+ *
+ * This is a simple stub dispatch for TenFourFox; it isn't
+ * intended to be upstreamed.
+ * Copyright 2017-8 Cameron Kaiser and Contributors to TenFourFox.
+ * All rights reserved. */
+
+#ifdef PNG_READ_SUPPORTED
+#if PNG_POWERPC_VMX_OPT > 0
+
+void
+png_init_filter_functions_vmx(png_structp pp, unsigned int bpp)
+{
+   /* If VMX is turned on, then we enable at compile time.
+      moz.build sets PNG_POWERPC_VMX_IMPLEMENTATION and
+      PNG_POWERPC_VMX_OPT based on TENFOURFOX_VMX. */
+
+   /* IMPORTANT: any new internal functions used here must be declared using
+    * PNG_INTERNAL_FUNCTION in ../pngpriv.h.  This is required so that the
+    * 'prefix' option to configure works:
+    *
+    *    ./configure --with-libpng-prefix=foobar_
+    *
+    * Verify you have got this right by running the above command, doing a build
+    * and examining pngprefix.h; it must contain a #define for every external
+    * function you add.  (Notice that this happens automatically for the
+    * initialization function.)
+    */
+   pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_vmx;
+
+   if (bpp == 3)
+   {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_vmx;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_vmx;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_vmx;
+   }
+
+   else if (bpp == 4)
+   {
+      pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_vmx;
+      pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_vmx;
+      pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_vmx;
+   }
+}
+
+#endif
+#endif
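
For reference, the vmx_paeth_process macro and the vec_min/vec_cmpeq sequences in the patch implement the standard PNG Paeth predictor, which picks whichever of the left (a), above (b), or upper-left (c) neighbour is closest to p = a + b - c. A minimal scalar sketch of that per-byte selection logic follows (illustrative only; the helper name paeth_predict is not part of the patch):

    #include <stdlib.h>

    /* Scalar PNG Paeth predictor for one byte: a = left, b = above,
     * c = upper-left.  Note that pa == abs(b - c) and pb == abs(a - c),
     * which is what vmx_paeth_process computes directly. */
    static unsigned char
    paeth_predict(unsigned char a, unsigned char b, unsigned char c)
    {
       int p  = (int)a + (int)b - (int)c;
       int pa = abs(p - (int)a);
       int pb = abs(p - (int)b);
       int pc = abs(p - (int)c);

       if (pa <= pb && pa <= pc)
          return a;
       if (pb <= pc)
          return b;
       return c;
    }

The Avg filters rely on a related identity: vec_avg() rounds up, computing (x + y + 1) >> 1 per element, so the code subtracts 1 wherever the two operands differ in their low bit (vec_and(vec_xor(x,y), vec_splat_u8(1))) to recover the truncating (x + y) >> 1 average that the PNG Avg filter requires.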