/* tenfourfox/media/libvpx/vp8/common/ppc/vp8_intrinsics_vmx.c */
/* Copyright 2018 Cameron Kaiser and Contributors to TenFourFox.
All rights reserved. */
#include <stddef.h>
#include "./vp8_rtcd.h"
#ifndef __ALTIVEC__
#error VMX being compiled on non-VMX platform
#else
#include <altivec.h>
#endif
/* Notes: SSE2 reference sequences these helpers emulate (raw register contents shown):
movdqa xmm0, [rdi] // (raw 0x00fffefdfcfbfaf9f8f7f6f5f4f3f2f1)
movdqa xmm1, xmm0 // (raw 0x00fffefdfcfbfaf9f8f7f6f5f4f3f2f1), same
pxor xmm2, xmm2 // (raw 0x0)
punpcklbw xmm0, xmm2 // converts low eight bytes to shorts
// (raw 0x0000ff00fe00fd00fc00fb00fa00f900)
// if with itself:
// (raw 0x0000fffffefefdfdfcfcfbfbfafaf9f9)
punpckhbw xmm1, xmm2 // converts high eight bytes to shorts
// (raw 0xf800f700f600f500f400f300f200f100)
// if with itself:
// (raw 0xf8f8f7f7f6f6f5f5f4f4f3f3f2f2f1f1)
mov rax, 0x7a // (raw 0x7a000000000000000000000000000000)
movd xmm0, rax
pshuflw xmm0, xmm0, 0x0
// (raw 0x7a007a007a007a000000000000000000)
punpcklqdq xmm0, xmm0
// (raw 0x7a007a007a007a007a007a007a007a00)
*/
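/* Illustrative sketch (not part of the original file): approximate VMX
   equivalents of the SSE2 sequences traced above, assuming big-endian
   element order. vec_mergeh/vec_mergel against a zero vector zero-extend
   the first/last eight bytes to shorts (the counterpart of
   punpcklbw/punpckhbw with a zeroed register), and vec_splat replicates
   one halfword across the vector (the counterpart of the
   mov/movd/pshuflw/punpcklqdq splat sequence). */
static inline void vmx_widen_u8_to_s16_sketch(vector unsigned char pixels,
                                              vector short *first8,
                                              vector short *last8)
{
  const vector unsigned char zero = vec_splat_u8(0);
  *first8 = (vector short)vec_mergeh(zero, pixels); /* bytes 0..7 -> shorts */
  *last8 = (vector short)vec_mergel(zero, pixels);  /* bytes 8..15 -> shorts */
}
static inline vector short vmx_splat_short0_sketch(vector short v)
{
  return vec_splat(v, 0); /* halfword 0 copied to all eight lanes */
}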
// v = destination vector, s = uint32_t * (address), vv = scratch vector unsigned int
// Basic unaligned 128-bit load: permute across two aligned loads. Works at any byte offset.
#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
// Equivalent of _mm_cvtsi32_si128. (Upper 96 bits are undefined.) Note that
// vec_lde requires natural element alignment, so this may misbehave if the
// address is not at least word-aligned.
#define _unaligned_load32(v,s) { v=vec_lde(0,s); v=vec_perm(v,v,vec_lvsl(0,s)); }
// Equivalent of _mm_cvtsi128_si32.
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }
// Equivalent of _mm_loadl_epi64. Simplest to implement as a full 128-bit load for now.
#define _unaligned_load64(v,s) _unaligned_load128(v,s)
// Equivalent of _mm_storel_epi64: effectively two 32-bit stores (elements 0 and 1).
#define _unaligned_store64(v,vv,s) {\
vv = vec_splat((vector unsigned int)v, 0); vec_ste(vv,0,s);\
vv = vec_splat((vector unsigned int)v, 1); vec_ste(vv,4,s);\
}
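/* Illustrative usage sketch (hypothetical pointer names; not part of the
   original file): copying eight bytes between word-aligned, but not
   necessarily 16-byte-aligned, buffers with the helpers above. The scratch
   vector vv is required by the 32/64-bit store macros. */
static inline void vmx_copy64_sketch(unsigned int *from, unsigned int *to)
{
  vector unsigned int v, vv;
  _unaligned_load64(v, from);    /* stands in for _mm_loadl_epi64 */
  _unaligned_store64(v, vv, to); /* stands in for _mm_storel_epi64 */
}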
/* XXX: not currently used; CONFIG_POSTPROC=0 */
void vp8_filter_by_weight16x16_vmx(unsigned char *src,
                                   int src_stride,
                                   unsigned char *dst,
                                   int dst_stride,
                                   int src_weight)
{
  short sw = (short)src_weight;
  vector short tmfqe_r = vec_splat_s16(8);        /* rounding term: 1 << (4 - 1) */
  vector unsigned short vfour = vec_splat_u16(4); /* shift count */
  vector short s1, s2, s2h, s2l, s3, s3h, s3l, s4, s4h, s4l, s5, s5h, s5l;
  vector unsigned char c0, c2, c3;
  vector short v1 = vec_add(tmfqe_r, tmfqe_r);    /* 16 = total weight */
  vector short v0 = { sw, sw, sw, sw, sw, sw, sw, sw };
  vector short vzero = vec_splat_s16(0);
  ptrdiff_t src_offs = 0;
  ptrdiff_t dst_offs = 0;
  // v0 = src_weight, s1 = dst_weight
  s1 = vec_sub(v1, v0);
#define COMBINE \
  c2 = vec_ld(src_offs, src); /* assumes 16-byte aligned rows */ \
  c3 = vec_ld(dst_offs, dst); \
  /* zero-extend the 16 bytes of each row to two vectors of shorts */ \
  s2h = (vector short)vec_mergeh((vector unsigned char)vzero, c2); \
  s3h = (vector short)vec_mergeh((vector unsigned char)vzero, c3); \
  s2l = (vector short)vec_mergel((vector unsigned char)vzero, c2); \
  s3l = (vector short)vec_mergel((vector unsigned char)vzero, c3); \
  /* src *= src_weight, dst *= dst_weight; the products fit in 16 bits */ \
  /* (255 * 16 max), so the modular low multiply vec_mladd is exact */ \
  s4h = vec_mladd(s2h, v0, vzero); \
  s5h = vec_mladd(s3h, s1, vzero); \
  s4l = vec_mladd(s2l, v0, vzero); \
  s5l = vec_mladd(s3l, s1, vzero); \
  /* sum the weighted terms, add the rounding term, shift back down */ \
  s2 = vec_add(s4h, s5h); \
  s3 = vec_add(s4l, s5l); \
  s4 = vec_add(s2, tmfqe_r); \
  s5 = vec_add(s3, tmfqe_r); \
  s2 = vec_sr(s4, vfour); \
  s3 = vec_sr(s5, vfour); \
  \
  c0 = vec_packsu(s2, s3); \
  vec_st(c0, dst_offs, dst); \
  src_offs += src_stride; \
  dst_offs += dst_stride;
  COMBINE COMBINE COMBINE COMBINE
  COMBINE COMBINE COMBINE COMBINE
  COMBINE COMBINE COMBINE COMBINE
  COMBINE COMBINE COMBINE COMBINE
}
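/* For reference, a minimal scalar sketch of what the vector routine above
   computes per pixel, assuming the usual libvpx filter-by-weight form
   (dst_weight = 16 - src_weight, round by 8, shift by 4). Illustrative
   only; not part of the original file and not called anywhere. */
static inline void vp8_filter_by_weight16x16_scalar_sketch(unsigned char *src,
                                                           int src_stride,
                                                           unsigned char *dst,
                                                           int dst_stride,
                                                           int src_weight)
{
  int dst_weight = 16 - src_weight;
  int r, c;
  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++) {
      dst[c] = (unsigned char)((src[c] * src_weight +
                                dst[c] * dst_weight + 8) >> 4);
    }
    src += src_stride;
    dst += dst_stride;
  }
}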
/* NYI: stub only; the parameters are not yet consumed. */
void vp8_variance_and_sad_16x16_vmx(unsigned char *src1,
                                    int stride1,
                                    unsigned char *src2,
                                    int stride2,
                                    unsigned int *variance,
                                    unsigned int *sad)
{
  vector unsigned short t128 = vec_sl(vec_splat_u16(8), vec_splat_u16(4)); /* 8 << 4 = 128 */
}
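/* Illustrative scalar sketch (not part of the original file) of what a
   finished routine above would presumably return: the sum of absolute
   differences over the 16x16 block pair, plus the variance of the
   differences in the usual libvpx form, sse - (sum*sum >> 8). The exact
   contract of the NYI function is an assumption here. */
static inline void variance_and_sad_16x16_scalar_sketch(unsigned char *src1,
                                                        int stride1,
                                                        unsigned char *src2,
                                                        int stride2,
                                                        unsigned int *variance,
                                                        unsigned int *sad)
{
  int r, c, sum = 0;
  unsigned int sse = 0, sad_acc = 0;
  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++) {
      int diff = src1[c] - src2[c];
      sum += diff;
      sse += (unsigned int)(diff * diff);
      sad_acc += (unsigned int)(diff < 0 ? -diff : diff);
    }
    src1 += stride1;
    src2 += stride2;
  }
  *sad = sad_acc;
  *variance = sse - (unsigned int)(((long long)sum * sum) >> 8);
}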