/* Mirror of https://github.com/classilla/tenfourfox.git
   (synced 2024-06-01 01:41:37 +00:00) */
/* Copyright 2018 Cameron Kaiser and Contributors to TenFourFox.
   All rights reserved. */

#include <stddef.h>

#include "./vp8_rtcd.h"

#ifndef __ALTIVEC__
#error VMX being compiled on non-VMX platform
#else
#include <altivec.h>
#endif

/* Notes:

   movdqa xmm0, [rdi]   // (raw 0x00fffefdfcfbfaf9f8f7f6f5f4f3f2f1)
   movdqa xmm1, xmm0    // (raw 0x00fffefdfcfbfaf9f8f7f6f5f4f3f2f1), same
   pxor xmm2, xmm2      // (raw 0x0)

   punpcklbw xmm0, xmm2 // converts low eight bytes to shorts
                        // (raw 0x0000ff00fe00fd00fc00fb00fa00f900)
                        // if with itself:
                        // (raw 0x0000fffffefefdfdfcfcfbfbfafaf9f9)
   punpckhbw xmm1, xmm2 // converts high eight bytes to shorts
                        // (raw 0xf800f700f600f500f400f300f200f100)
                        // if with itself:
                        // (raw 0xf8f8f7f7f6f6f5f5f4f4f3f3f2f2f1f1)

   mov rax, 0x7a        // (raw 0x7a000000000000000000000000000000)
   movd xmm0, rax
   pshuflw xmm0, xmm0, 0x0
                        // (raw 0x7a007a007a007a000000000000000000)
   punpcklqdq xmm0, xmm0
                        // (raw 0x7a007a007a007a007a007a007a007a00)
*/

// v = vector, s = *uint32_t, vv = temporary vector (unsigned int)
|
|
|
|
// Basic notion. Guaranteed to work at any offset.
|
|
#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
|
|
|
|
// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.) However,
|
|
// this may have issues loading at really weird addresses if they're not
|
|
// minimally word-aligned.
|
|
#define _unaligned_load32(v,s) { v=vec_lde(0,s); v=vec_perm(v,v,vec_lvsl(0,s)); }
|
|
// Equivalent for _mm_cvtsi128_si32.
|
|
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }
|
|
// Equivalent for _mm_loadl_epi64. Simplest just to make this a full load right now.
|
|
#define _unaligned_load64(v,s) _unaligned_load128(v,s)
|
|
// Equivalent for _mm_storel_epi64. Essentially acts as two store32s on different elements.
|
|
#define _unaligned_store64(v,vv,s) {\
|
|
vv = vec_splat((vector unsigned int)v, 0); vec_ste(vv,0,s);\
|
|
vv = vec_splat((vector unsigned int)v, 1); vec_ste(vv,4,s);\
|
|
}
|
|
|
|
/* XXX: not currently used; CONFIG_POSTPROC=0 */
|
|
void vp8_filter_by_weight16x16_vmx(unsigned char *src,
|
|
int src_stride,
|
|
unsigned char *dst,
|
|
int dst_stride,
|
|
int src_weight)
|
|
{
|
|
short sw = (short)src_weight;
|
|
vector short tmfqe_r = vec_splat_u16(8);
|
|
vector unsigned short vfour = vec_splat_u16(4);
|
|
vector short s0, s1, s2, s2h, s2l, s3, s3h, s3l, s4, s4h, s4l, s5, s5h, s5l, s6;
|
|
vector unsigned char c0, c1, c2, c3;
|
|
vector short v1 = vec_add(tmfqe_r, tmfqe_r);
|
|
vector short v0 = { sw, sw, sw, sw, sw, sw, sw, sw };
|
|
vector short vzero = vec_splat_u16(0);
|
|
ptrdiff_t src_offs = 0;
|
|
ptrdiff_t dst_offs = 0;
|
|
|
|
// v0 = src_weight, s1 = dst_weight
|
|
s1 = vec_sub(v1, v0);
|
|
|
|
#define COMBINE \
|
|
c2 = vec_ld(src_offs, src); \
|
|
c3 = vec_ld(dst_offs, dst); \
|
|
s2h = vec_mergeh((vector unsigned char)vzero, c2); /* src *= src_weight, dst *= dst_weight */ \
|
|
s3h = vec_mergeh((vector unsigned char)vzero, c3); \
|
|
s2l = vec_mergel((vector unsigned char)vzero, c2); \
|
|
s3l = vec_mergel((vector unsigned char)vzero, c3); \
|
|
s4h = vec_madds(s2h, v0, vzero); \
|
|
s5h = vec_madds(s3h, s1, vzero); \
|
|
s4l = vec_madds(s2l, v0, vzero); \
|
|
s5l = vec_madds(s2l, s1, vzero); \
|
|
s2 = vec_add(s2h, s3h); \
|
|
s3 = vec_add(s2l, s3l); \
|
|
s4 = vec_add(s2, tmfqe_r); \
|
|
s5 = vec_add(s3, tmfqe_r); \
|
|
s2 = vec_sr(s4, vfour); \
|
|
s3 = vec_sr(s5, vfour); \
|
|
\
|
|
c0 = vec_packsu(s2, s3); \
|
|
vec_st(c0, dst_offs, dst); \
|
|
src_offs += src_stride; \
|
|
dst_offs += dst_stride;
|
|
|
|
COMBINE COMBINE COMBINE COMBINE
|
|
COMBINE COMBINE COMBINE COMBINE
|
|
COMBINE COMBINE COMBINE COMBINE
|
|
COMBINE COMBINE COMBINE COMBINE
|
|
|
|
}
|
|
|
|
/* NYI */
|
|
void vp8_variance_and_sad_16x16_vmx(unsigned char *src1,
|
|
int stride1,
|
|
unsigned char *src2,
|
|
int stride2,
|
|
unsigned int *variance,
|
|
unsigned int *sad)
|
|
{
|
|
vector short t128 = vec_sl(vec_splat_u16(8), vec_splat_u16(4));
|
|
}
|