#324: vp8 intrinsics for future expansion

Cameron Kaiser 2017-11-30 22:05:32 -08:00
parent 8247519187
commit 515d20df86
1 changed file with 1 addition and 0 deletions


@@ -0,0 +1 @@
/* Copyright 2018 Cameron Kaiser and Contributors to TenFourFox. All rights reserved. */

#include <stddef.h>
#include "./vp8_rtcd.h"

#ifndef __ALTIVEC__
#error VMX being compiled on non-VMX platform
#else
#include <altivec.h>
#endif

/* Notes:

   movdqa xmm0, [rdi]   // (raw 0x00fffefdfcfbfaf9f8f7f6f5f4f3f2f1)
   movdqa xmm1, xmm0    // (raw 0x00fffefdfcfbfaf9f8f7f6f5f4f3f2f1), same
   pxor xmm2, xmm2      // (raw 0x0)
   punpcklbw xmm0, xmm2 // converts low eight bytes to shorts
                        // (raw 0x0000ff00fe00fd00fc00fb00fa00f900)
                        // if with itself:
                        // (raw 0x0000fffffefefdfdfcfcfbfbfafaf9f9)
   punpckhbw xmm1, xmm2 // converts high eight bytes to shorts
                        // (raw 0xf800f700f600f500f400f300f200f100)
                        // if with itself:
                        // (raw 0xf8f8f7f7f6f6f5f5f4f4f3f3f2f2f1f1)

   mov rax, 0x7a
   movd xmm0, rax          // (raw 0x7a000000000000000000000000000000)
   pshuflw xmm0, xmm0, 0x0 // (raw 0x7a007a007a007a000000000000000000)
   punpcklqdq xmm0, xmm0   // (raw 0x7a007a007a007a007a007a007a007a00)
*/

// v = vector, s = *uint32_t, vv = temporary vector (unsigned int)

// Basic notion. Guaranteed to work at any offset.
#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.) However,
// this may have issues loading at really weird addresses if they're not
// minimally word-aligned.
#define _unaligned_load32(v,s) { v=vec_lde(0,s); v=vec_perm(v,v,vec_lvsl(0,s)); }
// Equivalent for _mm_cvtsi128_si32.
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }
// Equivalent for _mm_loadl_epi64. Simplest just to make this a full load right now.
#define _unaligned_load64(v,s) _unaligned_load128(v,s)
// Equivalent for _mm_storel_epi64. Essentially acts as two store32s on different elements.
#define _unaligned_store64(v,vv,s) {\
    vv = vec_splat((vector unsigned int)v, 0); vec_ste(vv,0,s);\
    vv = vec_splat((vector unsigned int)v, 1); vec_ste(vv,4,s);\
}

/* XXX: not currently used; CONFIG_POSTPROC=0 */
void vp8_filter_by_weight16x16_vmx(unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride,
                                   int src_weight)
{
    short sw = (short)src_weight;
    vector short tmfqe_r = vec_splat_s16(8);        // rounding bit
    vector unsigned short vfour = vec_splat_u16(4); // final shift
    vector short s0, s1, s2, s2h, s2l, s3, s3h, s3l, s4, s4h, s4l, s5, s5h, s5l, s6;
    vector unsigned char c0, c1, c2, c3;
    vector short v1 = vec_add(tmfqe_r, tmfqe_r);    // total weight = 16
    vector short v0 = { sw, sw, sw, sw, sw, sw, sw, sw };
    vector short vzero = vec_splat_s16(0);
    ptrdiff_t src_offs = 0;
    ptrdiff_t dst_offs = 0;

    // v0 = src_weight, s1 = dst_weight
    s1 = vec_sub(v1, v0);

    // One 16-pixel row: widen src and dst to shorts, form the weighted sum,
    // add the rounding bit, shift right by 4 and pack back to bytes.
#define COMBINE \
    c2 = vec_ld(src_offs, src); \
    c3 = vec_ld(dst_offs, dst); \
    s2h = (vector short)vec_mergeh((vector unsigned char)vzero, c2); \
    s3h = (vector short)vec_mergeh((vector unsigned char)vzero, c3); \
    s2l = (vector short)vec_mergel((vector unsigned char)vzero, c2); \
    s3l = (vector short)vec_mergel((vector unsigned char)vzero, c3); \
    /* src *= src_weight, dst *= dst_weight; low-half multiply-add, since */ \
    /* the products never exceed 255 * 16 and fit in a short */ \
    s4h = vec_mladd(s2h, v0, vzero); \
    s5h = vec_mladd(s3h, s1, vzero); \
    s4l = vec_mladd(s2l, v0, vzero); \
    s5l = vec_mladd(s3l, s1, vzero); \
    s2 = vec_add(s4h, s5h); \
    s3 = vec_add(s4l, s5l); \
    s4 = vec_add(s2, tmfqe_r); \
    s5 = vec_add(s3, tmfqe_r); \
    s2 = vec_sr(s4, vfour); \
    s3 = vec_sr(s5, vfour); \
    \
    c0 = vec_packsu(s2, s3); \
    vec_st(c0, dst_offs, dst); \
    src_offs += src_stride; \
    dst_offs += dst_stride;

    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
}

/* NYI */
void vp8_variance_and_sad_16x16_vmx(unsigned char *src1, int stride1,
                                    unsigned char *src2, int stride2,
                                    unsigned int *variance, unsigned int *sad)
{
    vector short t128 = vec_sl(vec_splat_s16(8), vec_splat_u16(4)); // 128 in each element
}
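For illustration only (not part of the commit): a minimal sketch of how the unaligned load/store helpers above would be used on a big-endian VMX/AltiVec target. The buffer names and the small main() are invented for the example, and the two macros are repeated so the snippet stands alone.

#include <altivec.h>
#include <stdio.h>

#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }

int main(void) {
    unsigned char buf[32] __attribute__((aligned(16)));
    unsigned int out[4] __attribute__((aligned(16)));
    vector unsigned int v, vv;
    int i;

    for (i = 0; i < 32; i++) buf[i] = (unsigned char)i;

    /* 16-byte load from the misaligned address buf+3: vec_ld fetches the two
       enclosing aligned quadwords, and vec_lvsl/vec_perm rotate the wanted
       bytes into place. */
    _unaligned_load128(v, (unsigned int *)(buf + 3));

    /* Store element 0 to a word-aligned address: splatting first means
       vec_ste writes the same value regardless of which slot it selects. */
    _unaligned_store32(v, vv, out);

    printf("0x%08x\n", out[0]); /* bytes 3..6 of buf on big-endian PPC */
    return 0;
}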
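Also for illustration (not from the commit): a scalar model of the blend vp8_filter_by_weight16x16_vmx is built around, per pixel (src * src_weight + dst * (16 - src_weight) + 8) >> 4, matching the rounding constant of 8 and the shift by 4 in the vector code. The function name and test values here are made up.

#include <stdio.h>

/* Scalar reference: weights sum to 16, round with +8, shift right by 4. */
static void filter_by_weight16x16_ref(const unsigned char *src, int src_stride,
                                      unsigned char *dst, int dst_stride,
                                      int src_weight) {
    int dst_weight = 16 - src_weight;
    int r, c;
    for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
            dst[c] = (unsigned char)((src[c] * src_weight +
                                      dst[c] * dst_weight + 8) >> 4);
        src += src_stride;
        dst += dst_stride;
    }
}

int main(void) {
    unsigned char src[256], dst[256];
    int i;
    for (i = 0; i < 256; i++) {
        src[i] = (unsigned char)i;
        dst[i] = (unsigned char)(255 - i);
    }
    filter_by_weight16x16_ref(src, 16, dst, 16, 12); /* 12/16 src, 4/16 dst */
    printf("dst[0]=%d dst[255]=%d\n", dst[0], dst[255]);
    return 0;
}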