#324: vp8 intrinsics for future expansion

Cameron Kaiser 2017-11-30 22:05:32 -08:00
parent 8247519187
commit 515d20df86
1 changed file with 1 addition and 0 deletions


@@ -0,0 +1 @@
/* Copyright 2018 Cameron Kaiser and Contributors to TenFourFox. All rights reserved. */

#include <stddef.h>
#include "./vp8_rtcd.h"

#ifndef __ALTIVEC__
#error VMX being compiled on non-VMX platform
#else
#include <altivec.h>
#endif

/* Notes:

   movdqa xmm0, [rdi]   // (raw 0x00fffefdfcfbfaf9f8f7f6f5f4f3f2f1)
   movdqa xmm1, xmm0    // (raw 0x00fffefdfcfbfaf9f8f7f6f5f4f3f2f1), same
   pxor xmm2, xmm2      // (raw 0x0)
   punpcklbw xmm0, xmm2 // converts low eight bytes to shorts
                        // (raw 0x0000ff00fe00fd00fc00fb00fa00f900)
                        // if with itself:
                        // (raw 0x0000fffffefefdfdfcfcfbfbfafaf9f9)
   punpckhbw xmm1, xmm2 // converts high eight bytes to shorts
                        // (raw 0xf800f700f600f500f400f300f200f100)
                        // if with itself:
                        // (raw 0xf8f8f7f7f6f6f5f5f4f4f3f3f2f2f1f1)

   mov rax, 0x7a
   movd xmm0, rax          // (raw 0x7a000000000000000000000000000000)
   pshuflw xmm0, xmm0, 0x0 // (raw 0x7a007a007a007a000000000000000000)
   punpcklqdq xmm0, xmm0   // (raw 0x7a007a007a007a007a007a007a007a00)
*/

// v = vector, s = *uint32_t, vv = temporary vector (unsigned int)

// Basic notion. Guaranteed to work at any offset.
#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.) However,
// this may have issues loading at really weird addresses if they're not
// minimally word-aligned.
#define _unaligned_load32(v,s) { v=vec_lde(0,s); v=vec_perm(v,v,vec_lvsl(0,s)); }
// Equivalent for _mm_cvtsi128_si32.
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }
// Equivalent for _mm_loadl_epi64. Simplest just to make this a full load right now.
#define _unaligned_load64(v,s) _unaligned_load128(v,s)
// Equivalent for _mm_storel_epi64. Essentially acts as two store32s on different elements.
#define _unaligned_store64(v,vv,s) {\
    vv = vec_splat((vector unsigned int)v, 0); vec_ste(vv,0,s);\
    vv = vec_splat((vector unsigned int)v, 1); vec_ste(vv,4,s);\
}

/* XXX: not currently used; CONFIG_POSTPROC=0 */
void vp8_filter_by_weight16x16_vmx(unsigned char *src, int src_stride,
                                   unsigned char *dst, int dst_stride,
                                   int src_weight)
{
    short sw = (short)src_weight;
    vector short tmfqe_r = vec_splat_s16(8);        // rounding bit
    vector unsigned short vfour = vec_splat_u16(4); // final shift
    vector short s0, s1, s2, s2h, s2l, s3, s3h, s3l, s4, s4h, s4l, s5, s5h, s5l, s6;
    vector unsigned char c0, c1, c2, c3;
    vector short v1 = vec_add(tmfqe_r, tmfqe_r);    // total weight = 16
    vector short v0 = { sw, sw, sw, sw, sw, sw, sw, sw };
    vector short vzero = vec_splat_s16(0);
    ptrdiff_t src_offs = 0;
    ptrdiff_t dst_offs = 0;

    // v0 = src_weight, s1 = dst_weight
    s1 = vec_sub(v1, v0);

    // One 16-pixel row: widen src and dst to shorts, form the weighted sum,
    // add the rounding bit, shift right by 4 and pack back to bytes.
#define COMBINE \
    c2 = vec_ld(src_offs, src); \
    c3 = vec_ld(dst_offs, dst); \
    s2h = (vector short)vec_mergeh((vector unsigned char)vzero, c2); \
    s3h = (vector short)vec_mergeh((vector unsigned char)vzero, c3); \
    s2l = (vector short)vec_mergel((vector unsigned char)vzero, c2); \
    s3l = (vector short)vec_mergel((vector unsigned char)vzero, c3); \
    /* src *= src_weight, dst *= dst_weight; low-half multiply-add, since */ \
    /* the products never exceed 255 * 16 and fit in a short */ \
    s4h = vec_mladd(s2h, v0, vzero); \
    s5h = vec_mladd(s3h, s1, vzero); \
    s4l = vec_mladd(s2l, v0, vzero); \
    s5l = vec_mladd(s3l, s1, vzero); \
    s2 = vec_add(s4h, s5h); \
    s3 = vec_add(s4l, s5l); \
    s4 = vec_add(s2, tmfqe_r); \
    s5 = vec_add(s3, tmfqe_r); \
    s2 = vec_sr(s4, vfour); \
    s3 = vec_sr(s5, vfour); \
    \
    c0 = vec_packsu(s2, s3); \
    vec_st(c0, dst_offs, dst); \
    src_offs += src_stride; \
    dst_offs += dst_stride;

    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
    COMBINE
}

/* NYI */
void vp8_variance_and_sad_16x16_vmx(unsigned char *src1, int stride1,
                                    unsigned char *src2, int stride2,
                                    unsigned int *variance, unsigned int *sad)
{
    vector short t128 = vec_sl(vec_splat_s16(8), vec_splat_u16(4)); // 128 in each element
}
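For illustration only (not part of the commit): a minimal sketch of how the unaligned load/store helpers above would be used on a big-endian VMX/AltiVec target. The buffer names and the small main() are invented for the example, and the two macros are repeated so the snippet stands alone.

#include <altivec.h>
#include <stdio.h>

#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }

int main(void) {
    unsigned char buf[32] __attribute__((aligned(16)));
    unsigned int out[4] __attribute__((aligned(16)));
    vector unsigned int v, vv;
    int i;

    for (i = 0; i < 32; i++) buf[i] = (unsigned char)i;

    /* 16-byte load from the misaligned address buf+3: vec_ld fetches the two
       enclosing aligned quadwords, and vec_lvsl/vec_perm rotate the wanted
       bytes into place. */
    _unaligned_load128(v, (unsigned int *)(buf + 3));

    /* Store element 0 to a word-aligned address: splatting first means
       vec_ste writes the same value regardless of which slot it selects. */
    _unaligned_store32(v, vv, out);

    printf("0x%08x\n", out[0]); /* bytes 3..6 of buf on big-endian PPC */
    return 0;
}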
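Also for illustration (not from the commit): a scalar model of the blend vp8_filter_by_weight16x16_vmx is built around, per pixel (src * src_weight + dst * (16 - src_weight) + 8) >> 4, matching the rounding constant of 8 and the shift by 4 in the vector code. The function name and test values here are made up.

#include <stdio.h>

/* Scalar reference: weights sum to 16, round with +8, shift right by 4. */
static void filter_by_weight16x16_ref(const unsigned char *src, int src_stride,
                                      unsigned char *dst, int dst_stride,
                                      int src_weight) {
    int dst_weight = 16 - src_weight;
    int r, c;
    for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
            dst[c] = (unsigned char)((src[c] * src_weight +
                                      dst[c] * dst_weight + 8) >> 4);
        src += src_stride;
        dst += dst_stride;
    }
}

int main(void) {
    unsigned char src[256], dst[256];
    int i;
    for (i = 0; i < 256; i++) {
        src[i] = (unsigned char)i;
        dst[i] = (unsigned char)(255 - i);
    }
    filter_by_weight16x16_ref(src, 16, dst, 16, 12); /* 12/16 src, 4/16 dst */
    printf("dst[0]=%d dst[255]=%d\n", dst[0], dst[255]);
    return 0;
}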