mirror of
https://github.com/classilla/tenfourfox.git
synced 2024-06-06 22:29:34 +00:00
954 lines
24 KiB
C
954 lines
24 KiB
C
/*
 * Copyright (c) 2017 Cameron Kaiser and Contributors to TenFourFox
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
|
|
|
|
#include "./vp9_rtcd.h"
|
|
#include "vpx_ports/mem.h"
|
|
|
|
#ifndef __ALTIVEC__
|
|
#error VMX being compiled on non-VMX platform
|
|
#else
|
|
#include <altivec.h>
|
|
#endif
|
|
|
|
/* AltiVec-accelerated VP9 intra frame prediction for big-endian 32-bit PowerPC. */
|
|
|
|
// Unaligned load/store helpers.
// v = vector, s = pointer (uint32_t * unless the call site says otherwise),
// vv = temporary vector (unsigned int).
// Each macro expands to exactly one statement; the call site supplies the
// trailing semicolon. Note that `s` is evaluated more than once.

// Basic notion. Guaranteed to work at any offset.
// The second load uses byte offset 15 rather than 16. For a misaligned s
// both offsets land in the same following quadword, so the result is
// unchanged. For a 16-byte aligned s the second load now re-reads the
// first quadword (which vec_lvsl then ignores) instead of touching the
// 16 bytes beyond the data being loaded.
#define _unaligned_load128(v,s) \
    do { (v) = vec_perm(vec_ld(0,(s)), vec_ld(15,(s)), vec_lvsl(0,(s))); } while (0)

// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.) However,
// this may have issues loading at really weird addresses if they're not
// minimally word-aligned.
#define _unaligned_load32(v,s) \
    do { (v) = vec_lde(0,(s)); (v) = vec_perm((v),(v),vec_lvsl(0,(s))); } while (0)

// Equivalent for _mm_cvtsi128_si32.
#define _unaligned_store32(v,vv,s) \
    do { (vv) = vec_splat((vector unsigned int)(v), 0); vec_ste((vv),0,(s)); } while (0)

// Equivalent for _mm_loadl_epi64. Simplest just to make this a full load right now.
#define _unaligned_load64(v,s) _unaligned_load128(v,s)

// Equivalent for _mm_storel_epi64. Essentially acts as two store32s on different elements.
#define _unaligned_store64(v,vv,s) \
    do { \
        (vv) = vec_splat((vector unsigned int)(v), 0); vec_ste((vv),0,(s)); \
        (vv) = vec_splat((vector unsigned int)(v), 1); vec_ste((vv),4,(s)); \
    } while (0)
|
|
|
|
/* 4x4 */
|
|
|
|
/*
 * DC predictor, 4x4.
 * Sums the four bytes at `above` and the four bytes at `left`, computes
 * (sum + 4) >> 3, and writes that byte value into four 4-byte rows of
 * dst, y_stride bytes apart.
 */
void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left)
{
    // Descended from the MMX version, so the loads are unaligned. :(
    const vector unsigned int zero = vec_splat_u32(0);
    const vector unsigned int rounding = vec_splat_u32(4);
    const vector unsigned int shift = vec_splat_u32(3);
    vector unsigned int top, side, both, byte_sums, total, dc32;
    vector unsigned short dc16;
    vector unsigned char dc8;
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;
    int row;

    _unaligned_load32(top, (uint32_t *)above);
    _unaligned_load32(side, (uint32_t *)left);
    // punpckldq: words 0 and 1 now hold the above and left pixels.
    both = vec_mergeh(top, side);

    // The Intel MMX version takes a sum of absolute differences against
    // zero, which is really just a cross sum of the bytes.
    byte_sums = vec_sum4s((vector unsigned char)both, zero);
    total = (vector unsigned int)vec_sum2s((vector signed int)byte_sums,
                                           (vector signed int)zero);

    // Stay in 32 bits for the rounding add and the shift.
    dc32 = vec_sra(vec_add(total, rounding), shift);

    // ENDIAN NOTE!
    // The result sits in the higher word of the low 64 bits, i.e. index 1,
    // and it is still index 1 after packing down to shorts. Splat that
    // short, then pack once more to get the value in every byte.
    dc16 = vec_splat(vec_packsu(dc32, dc32), 1);
    dc8 = vec_packsu(dc16, dc16);

    // Every element is identical after the splat, so one vec_ste per row
    // is enough; no need for _unaligned_store32.
    for (row = 0; row < 4; row++) {
        vec_ste((vector unsigned int)dc8, offs, out);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Shared body of the 4x4 DC-top and DC-left predictors.
 * Sums the four bytes at `what`, computes (sum + 2) >> 2, and writes that
 * byte value into four 4-byte rows of dst, y_stride bytes apart.
 *
 * Declared `static inline`: a file-scope `inline` definition without
 * `static` or `extern` is only an inline definition under C99 and later,
 * so no out-of-line copy would be emitted for the callers in this file
 * whenever the compiler decides not to inline the call.
 */
static inline void _common_top_or_left_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
    // Similar idea to the standard predictor, but one column only.
    const vector unsigned int vzero = vec_splat_u32(0);
    const vector unsigned int vtwo = vec_splat_u32(2);   // both rounding term and shift count
    vector unsigned int m0, m1, m2, m3;
    vector unsigned short s0, s1;
    vector unsigned char c0;
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;
    int row;

    _unaligned_load32(m1, (uint32_t *)what);
    // Interpolate zero to clear out the upper bits so that we get a
    // proper cross-sum over the full range.
    m0 = vec_mergeh(m1, vzero);
    m2 = vec_sum4s((vector unsigned char)m0, vzero);
    m3 = (vector unsigned int)vec_sum2s((vector signed int)m2, (vector signed int)vzero);
    m0 = vec_add(m3, vtwo);
    m1 = vec_sra(m0, vtwo);

    // Pack to 16 bits, splat the short holding the result (index 1),
    // and pack again to yield the value in every byte.
    s0 = vec_packsu(m1, m1);
    s1 = vec_splat(s0, 1);
    c0 = vec_packsu(s1, s1);

    // All elements are equal, so a single element store per row suffices.
    for (row = 0; row < 4; row++) {
        vec_ste((vector unsigned int)c0, offs, out);
        offs += y_stride;
    }
}
|
|
|
|
/* DC-top predictor, 4x4: forwards `above` to the shared 4x4 helper; `left` is not used. */
void vp9_dc_top_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left)
{
    (void)left;
    _common_top_or_left_4x4_vmx(dst, y_stride, above);
}
|
|
/* DC-left predictor, 4x4: forwards `left` to the shared 4x4 helper; `above` is not used. */
void vp9_dc_left_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left)
{
    (void)above;
    _common_top_or_left_4x4_vmx(dst, y_stride, left);
}
|
|
|
|
/*
 * DC-128 predictor, 4x4.
 * Writes the byte value 128 into four 4-byte rows of dst, y_stride bytes
 * apart. Neither `above` nor `left` is read.
 */
void vp9_dc_128_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left)
{
    // 2 << 6 gives 128 in every byte. The result is held as an int vector
    // because the original notes that storing from a char vector does not
    // work properly here.
    const vector unsigned char two = vec_splat_u8(2);
    const vector unsigned char six = vec_splat_u8(6);
    const vector unsigned int fill = (vector unsigned int)vec_sl(two, six);
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;
    int row;

    (void)above;
    (void)left;

    // Pretty much just blit 128s.
    for (row = 0; row < 4; row++) {
        vec_ste(fill, offs, out);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Vertical predictor, 4x4.
 * Copies the four bytes at `above` into each of four rows of dst,
 * y_stride bytes apart. `left` is not used.
 */
void vp9_v_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left)
{
    vector unsigned int loaded, row_word;
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;
    int row;

    (void)left;

    // Pretty much just copy: fetch the word, replicate it into every
    // element, then store one element per row.
    _unaligned_load32(loaded, (uint32_t *)above);
    row_word = vec_splat(loaded, 0);

    for (row = 0; row < 4; row++) {
        vec_ste(row_word, offs, out);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Horizontal predictor, 4x4.
 * Row i of dst (rows are y_stride bytes apart) is filled with four copies
 * of left[i]. `above` is not used.
 */
void vp9_h_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left)
{
    // Expand sequence
    // aa bb cc dd -- -- -- -- -- ....
    // to
    // aa aa aa aa bb bb bb bb cc ....
    // This can be done with just splats.

    vector unsigned int packed;
    vector unsigned char pix;
    vector unsigned int m0, m1, m2, m3;
    uint32_t *out = (uint32_t *)dst;

    (void)above;

    // The load is done through a uint32_t pointer, so it yields a word
    // vector. Load into a word vector and reinterpret it explicitly
    // rather than relying on an implicit conversion between vector types.
    _unaligned_load32(packed, (uint32_t *)left);
    pix = (vector unsigned char)packed;

    m0 = (vector unsigned int)vec_splat(pix, 0);
    m1 = (vector unsigned int)vec_splat(pix, 1);
    m2 = (vector unsigned int)vec_splat(pix, 2);
    m3 = (vector unsigned int)vec_splat(pix, 3);

    vec_ste(m0, 0, out);
    vec_ste(m1, y_stride, out);
    vec_ste(m2, y_stride + y_stride, out);
    vec_ste(m3, y_stride + y_stride + y_stride, out);
}
|
|
|
|
#if(0)
// DISABLED: this doesn't work properly, and the large amount of unaligned
// memory access in the True Motion predictors makes them a poor
// fit for AltiVec. Kept for reference only; nothing here is compiled.
void vp9_tm_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left)
{
    // Get the last top value from above - 1 and splat it ("tl").
    // Using that preceding byte, compute t[i]-tl+l[i] for the
    // appropriate block size where t = top and l = left with our
    // tl vector above.

    vector unsigned char c0, c1, c2, c3, c4;
    vector unsigned short s0, s1, s2, s3, s4, s5, s6, tl;
    vector unsigned int m0;
    vector unsigned char vzero = vec_splat_u8(0);
    ptrdiff_t offs = 0;

    // This can load at really weird addresses, so our
    // faster unaligned load32 macro is not sufficient.
    _unaligned_load128(c0, (uint32_t *)(above - 1));
    c1 = vec_splat(c0, 0);
    tl = vec_mergeh(vzero, c1);

    // Expand t to short and subtract tl.
    _unaligned_load128(c0, (uint32_t *)above);
    s1 = vec_mergeh(vzero, c0);
    s0 = vec_sub(s1, tl);

    // One invocation handles two rows; the stores go out byte by byte
    // after rotating with vec_lvsr.
#define TM_2X2(x) \
    _unaligned_load128(c2, (uint32_t *)(left + 4 - x)); \
    _unaligned_load128(c3, (uint32_t *)(left + 5 - x)); \
    s2 = vec_mergeh(vzero, c2); \
    s3 = vec_mergeh(vzero, c3); \
    s4 = vec_splat(s2, 0); \
    s5 = vec_splat(s3, 0); \
    s2 = vec_add(s0, s4); \
    s3 = vec_add(s0, s5); \
    c2 = vec_packsu(s2, s2); \
    c3 = vec_packsu(s3, s3); \
    c4 = vec_perm(c2, c2, vec_lvsr(0, (uint32_t *)(dst + offs))); \
    vec_ste(c4, 0, (dst + offs)); \
    vec_ste(c4, 1, (dst + offs)); \
    vec_ste(c4, 2, (dst + offs)); \
    vec_ste(c4, 3, (dst + offs)); \
    c4 = vec_perm(c3, c3, vec_lvsr(0, (uint32_t *)(dst + offs + y_stride))); \
    vec_ste(c4, 0, (dst + offs + y_stride)); \
    vec_ste(c4, 1, (dst + offs + y_stride)); \
    vec_ste(c4, 2, (dst + offs + y_stride)); \
    vec_ste(c4, 3, (dst + offs + y_stride)); \
    offs += y_stride + y_stride;

    TM_2X2(4)
    TM_2X2(2)
#undef TM_2X2
}
#endif
|
|
|
|
/* 8x8 */
|
|
|
|
/*
 * DC predictor, 8x8.
 * Sums the eight bytes at `above` and the eight bytes at `left`, computes
 * (sum + 8) >> 4, and writes that byte value into eight 8-byte rows of
 * dst, y_stride bytes apart.
 */
void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left)
{
    // Descended from the MMX version, so the loads are unaligned. :(
    const vector unsigned int zero = vec_splat_u32(0);
    const vector unsigned int rounding = vec_splat_u32(8);
    const vector unsigned int shift = vec_splat_u32(4);
    vector unsigned int top, side, top_sum, side_sum, dc32;
    vector unsigned short dc16;
    vector unsigned char dc8;
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;
    int row;

    _unaligned_load64(top, (uint32_t *)above);
    _unaligned_load64(side, (uint32_t *)left);

    // As in the 4x4 case this is an SAD against zero, done once per input.
    top_sum = vec_sum4s((vector unsigned char)top, zero);
    side_sum = vec_sum4s((vector unsigned char)side, zero);
    top_sum = (vector unsigned int)vec_sum2s((vector signed int)top_sum,
                                             (vector signed int)zero);
    side_sum = (vector unsigned int)vec_sum2s((vector signed int)side_sum,
                                              (vector signed int)zero);

    // Keep working in 32 bits until the pack/splat/pack below.
    dc32 = vec_sra(vec_adds(vec_adds(top_sum, side_sum), rounding), shift);

    // Pack to 16 bits, splat the short at index 1, pack again to 8 bits.
    dc16 = vec_splat(vec_packsu(dc32, dc32), 1);
    dc8 = vec_packsu(dc16, dc16);

    // Already splatted, so two element stores per row beat repeated
    // _unaligned_store64s.
    for (row = 0; row < 8; row++) {
        vec_ste((vector unsigned int)dc8, offs, out);
        vec_ste((vector unsigned int)dc8, 4 + offs, out);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Shared body of the 8x8 DC-top and DC-left predictors.
 * Sums the eight bytes at `what`, computes (sum + 4) >> 3, and writes
 * that byte value into eight 8-byte rows of dst, y_stride bytes apart.
 *
 * Declared `static inline`: a file-scope `inline` definition without
 * `static` or `extern` is only an inline definition under C99 and later,
 * so no out-of-line copy would be emitted for the callers in this file
 * whenever the compiler decides not to inline the call.
 */
static inline void _common_top_or_left_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
    // Again, single column variant.
    const vector unsigned int vzero = vec_splat_u32(0);
    const vector unsigned int vfour = vec_splat_u32(4);
    const vector unsigned int vthree = vec_splat_u32(3);
    vector unsigned int m0, m1, m2, m3, m4;
    vector unsigned short s0, s1;
    vector unsigned char c0;
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;
    int row;

    _unaligned_load64(m0, (uint32_t *)what);

    // Since all these functions from here load at least 64 bits, we don't need
    // to interpolate zero anymore to clear out the other half for the SAD.
    m1 = vec_sum4s((vector unsigned char)m0, vzero);
    m2 = (vector unsigned int)vec_sum2s((vector signed int)m1, (vector signed int)vzero);
    m3 = vec_adds(m2, vfour);
    m4 = vec_sra(m3, vthree);

    // Pack to 16 bits, splat the short at index 1, pack again to 8 bits.
    s0 = vec_packsu(m4, m4);
    s1 = vec_splat(s0, 1);
    c0 = vec_packsu(s1, s1);

    // Two word-sized element stores cover each 8-byte row.
    for (row = 0; row < 8; row++) {
        vec_ste((vector unsigned int)c0, offs, out);
        vec_ste((vector unsigned int)c0, 4 + offs, out);
        offs += y_stride;
    }
}
|
|
|
|
/* DC-top predictor, 8x8: forwards `above` to the shared 8x8 helper; `left` is not used. */
void vp9_dc_top_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left)
{
    (void)left;
    _common_top_or_left_8x8_vmx(dst, y_stride, above);
}
|
|
/* DC-left predictor, 8x8: forwards `left` to the shared 8x8 helper; `above` is not used. */
void vp9_dc_left_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left)
{
    (void)above;
    _common_top_or_left_8x8_vmx(dst, y_stride, left);
}
|
|
|
|
/*
 * DC-128 predictor, 8x8.
 * Writes the byte value 128 into eight 8-byte rows of dst, y_stride bytes
 * apart. Neither `above` nor `left` is read.
 */
void vp9_dc_128_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left)
{
    // 2 << 6 gives 128 in every byte; held as an int vector for the
    // word-sized element stores.
    const vector unsigned int fill =
        (vector unsigned int)vec_sl(vec_splat_u8(2), vec_splat_u8(6));
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;
    int row;

    (void)above;
    (void)left;

    // Yup, blitting 128s again: two words per row.
    for (row = 0; row < 8; row++) {
        vec_ste(fill, offs, out);
        vec_ste(fill, 4 + offs, out);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Vertical predictor, 8x8.
 * Copies the eight bytes at `above` into each of eight rows of dst,
 * y_stride bytes apart. `left` is not used.
 */
void vp9_v_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left)
{
    vector unsigned int loaded, first_word, second_word;
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;
    int row;

    (void)left;

    // Yup, copying again. Splat each of the two source words so that the
    // element stores below write the right word whichever element they pick.
    _unaligned_load64(loaded, (uint32_t *)above);
    first_word = vec_splat(loaded, 0);
    second_word = vec_splat(loaded, 1);

    for (row = 0; row < 8; row++) {
        vec_ste(first_word, offs, out);
        vec_ste(second_word, 4 + offs, out);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Horizontal predictor, 8x8.
 * Row i of dst (rows are y_stride bytes apart) is filled with eight
 * copies of left[i]. `above` is not used.
 */
void vp9_h_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left)
{
    // Expand sequence
    // aa bb cc dd ee ff gg hh -- -- -- -- -- ....
    // to
    // aa aa aa aa bb bb bb bb cc cc cc cc dd ....

    vector unsigned char col;
    uint32_t *out = (uint32_t *)dst;
    ptrdiff_t offs = 0;

    (void)above;

    // Load through the byte pointer so the macro yields a byte vector
    // directly (as the 16x16 horizontal predictor does) instead of
    // assigning a word vector to a byte vector implicitly.
    _unaligned_load64(col, left);

    // vec_splat needs a literal element index, so the rows are unrolled.
    // Each row is one splatted byte written as two words.
#define H_ROW(n) \
    do { \
        const vector unsigned int px = (vector unsigned int)vec_splat(col, n); \
        vec_ste(px, offs, out); \
        vec_ste(px, 4 + offs, out); \
        offs += y_stride; \
    } while (0)

    H_ROW(0);
    H_ROW(1);
    H_ROW(2);
    H_ROW(3);
    H_ROW(4);
    H_ROW(5);
    H_ROW(6);
    H_ROW(7);

#undef H_ROW
}
|
|
|
|
/* 16x16 */
|
|
|
|
/*
 * DC predictor, 16x16.
 * Sums the 16 bytes at `above` and the 16 bytes at `left`, computes
 * (sum + 16) >> 5, and writes that byte value into sixteen 16-byte rows
 * of dst, y_stride bytes apart. All loads and stores use the aligned
 * vec_ld / vec_st forms.
 */
void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left)
{
    // Finally, alignment! The Intel SSE2 version uses movdqa for both
    // loads and stores, which implies aligned accesses are safe here too.
    const vector unsigned int zero = vec_splat_u32(0);
    const vector unsigned int shift = vec_splat_u32(5);
    // The rounding term 16 is formed as 8 << 1.
    const vector unsigned int rounding = vec_sl(vec_splat_u32(8), vec_splat_u32(1));
    vector unsigned int top_sum, side_sum, both, side_rot, upper, total, dc32;
    vector unsigned short dc16;
    vector unsigned char dc8;
    ptrdiff_t offs = 0;
    int row;

    // The SSE2 version starts out in 32-bit words, as we do.
    top_sum = vec_sum4s((vector unsigned char)vec_ld(0, (uint32_t *)above), zero);
    side_sum = vec_sum4s((vector unsigned char)vec_ld(0, (uint32_t *)left), zero);
    top_sum = (vector unsigned int)vec_sum2s((vector signed int)top_sum,
                                             (vector signed int)zero);
    side_sum = (vector unsigned int)vec_sum2s((vector signed int)side_sum,
                                              (vector signed int)zero);
    both = vec_adds(top_sum, side_sum);

    // Bring the other 64-bit half down next to the first (movhlps
    // equivalent) so that word 1 ends up holding the complete sum.
    side_rot = vec_sld(side_sum, side_sum, 8);
    upper = vec_sld(both, side_rot, 8);
    total = vec_adds(both, upper);

    dc32 = vec_sra(vec_adds(total, rounding), shift);

    // Pack to 16 bits, splat the short at index 1, pack again to 8 bits.
    dc16 = vec_splat(vec_packsu(dc32, dc32), 1);
    dc8 = vec_packsu(dc16, dc16);

    // 16 stores, one full vector per row.
    for (row = 0; row < 16; row++) {
        vec_st(dc8, offs, dst);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Shared body of the 16x16 DC-top and DC-left predictors.
 * Sums the 16 bytes at `what` (aligned load), computes (sum + 8) >> 4,
 * and writes that byte value into sixteen 16-byte rows of dst, y_stride
 * bytes apart.
 */
void _common_top_or_left_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
    const vector unsigned int zero = vec_splat_u32(0);
    const vector unsigned int shift = vec_splat_u32(4);
    const vector unsigned int rounding = vec_splat_u32(8);
    vector unsigned int pair_sums, upper, total, dc32;
    vector unsigned short dc16;
    vector unsigned char dc8;
    ptrdiff_t offs = 0;
    int row;

    // The Intel version is the full 16x16 routine with the second input
    // zeroed out. An SAD of zero against zero is always zero, so that
    // half of the work is simply dropped here.
    pair_sums = vec_sum4s((vector unsigned char)vec_ld(0, (uint32_t *)what), zero);
    pair_sums = (vector unsigned int)vec_sum2s((vector signed int)pair_sums,
                                               (vector signed int)zero);

    // Likewise the full movhlps sequence is unnecessary: the other vector
    // would be all zero, so a single shift against zero does the job.
    upper = vec_sld(pair_sums, zero, 8);
    total = vec_adds(upper, pair_sums);

    dc32 = vec_sra(vec_adds(total, rounding), shift);

    // Pack to 16 bits, splat the short at index 1, pack again to 8 bits.
    dc16 = vec_splat(vec_packsu(dc32, dc32), 1);
    dc8 = vec_packsu(dc16, dc16);

    // 16 stores, one full vector per row.
    for (row = 0; row < 16; row++) {
        vec_st(dc8, offs, dst);
        offs += y_stride;
    }
}
|
|
|
|
/* DC-top predictor, 16x16: forwards `above` to the shared 16x16 helper; `left` is not used. */
void vp9_dc_top_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left)
{
    (void)left;
    _common_top_or_left_16x16_vmx(dst, y_stride, above);
}
|
|
/* DC-left predictor, 16x16: forwards `left` to the shared 16x16 helper; `above` is not used. */
void vp9_dc_left_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above, const uint8_t *left)
{
    (void)above;
    _common_top_or_left_16x16_vmx(dst, y_stride, left);
}
|
|
|
|
/*
 * DC-128 predictor, 16x16.
 * Writes the byte value 128 into sixteen 16-byte rows of dst, y_stride
 * bytes apart, using aligned vec_st stores. Neither `above` nor `left`
 * is read.
 */
void vp9_dc_128_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left)
{
    // 2 << 6 gives 128 in every byte.
    const vector unsigned char fill = vec_sl(vec_splat_u8(2), vec_splat_u8(6));
    ptrdiff_t offs = 0;
    int row;

    (void)above;
    (void)left;

    // 16 stores, one full vector per row.
    for (row = 0; row < 16; row++) {
        vec_st(fill, offs, dst);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Vertical predictor, 16x16.
 * Loads the 16 bytes at `above` (aligned vec_ld) and stores them into
 * each of sixteen rows of dst, y_stride bytes apart (aligned vec_st).
 * `left` is not used.
 */
void vp9_v_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left)
{
    const vector unsigned char top_row = vec_ld(0, above);
    ptrdiff_t offs = 0;
    int row;

    (void)left;

    // 16 stores: the same vector goes into every row.
    for (row = 0; row < 16; row++) {
        vec_st(top_row, offs, dst);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Horizontal predictor, 16x16.
 * Row i of dst (rows are y_stride bytes apart) is filled with sixteen
 * copies of left[i]. `left` is fetched with the unaligned load macro;
 * rows are written with aligned vec_st stores. `above` is not used.
 */
void vp9_h_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left)
{
    // Expand an entire 16-byte vector to 16 splatted vectors.
    // Unfortunately, the load is not aligned, but the stores are.
    vector unsigned char col;
    ptrdiff_t offs = 0;

    (void)above;

    _unaligned_load128(col, left);

    // vec_splat needs a literal element index, so the 16 rows are unrolled.
#define H_ROW(n) \
    do { \
        const vector unsigned char px = vec_splat(col, n); \
        vec_st(px, offs, dst); \
        offs += y_stride; \
    } while (0)

    H_ROW(0);
    H_ROW(1);
    H_ROW(2);
    H_ROW(3);

    H_ROW(4);
    H_ROW(5);
    H_ROW(6);
    H_ROW(7);

    H_ROW(8);
    H_ROW(9);
    H_ROW(10);
    H_ROW(11);

    H_ROW(12);
    H_ROW(13);
    H_ROW(14);
    H_ROW(15);

#undef H_ROW
}
|
|
|
|
/* 32x32 */
|
|
|
|
/*
 * DC predictor, 32x32.
 * Sums the 32 bytes at `above` and the 32 bytes at `left`, computes
 * (sum + 32) >> 6, and writes that byte value into thirty-two 32-byte
 * rows of dst, y_stride bytes apart. All loads and stores use the aligned
 * vec_ld / vec_st forms.
 */
void vp9_dc_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left)
{
    // Also aligned. Approximately the 16x16 routine, double-pumped.
    const vector unsigned int zero = vec_splat_u32(0);
    const vector unsigned int shift = vec_splat_u32(6);
    // The rounding term 32 is formed as 8 << 2.
    const vector unsigned int rounding = vec_sl(vec_splat_u32(8), vec_splat_u32(2));
    vector unsigned int top_lo, top_hi, side_lo, side_hi;
    vector unsigned int both, rot, upper, total, dc32;
    vector unsigned short dc16;
    vector unsigned char dc8;
    ptrdiff_t offs = 0;
    int row;

    top_lo = vec_ld(0, (uint32_t *)above);
    top_hi = vec_ld(16, (uint32_t *)above);
    side_lo = vec_ld(0, (uint32_t *)left);
    side_hi = vec_ld(16, (uint32_t *)left);

    // Byte sums per word, then per pair of words.
    top_lo = vec_sum4s((vector unsigned char)top_lo, zero);
    top_hi = vec_sum4s((vector unsigned char)top_hi, zero);
    side_lo = vec_sum4s((vector unsigned char)side_lo, zero);
    side_hi = vec_sum4s((vector unsigned char)side_hi, zero);

    top_lo = (vector unsigned int)vec_sum2s((vector signed int)top_lo, (vector signed int)zero);
    top_hi = (vector unsigned int)vec_sum2s((vector signed int)top_hi, (vector signed int)zero);
    side_lo = (vector unsigned int)vec_sum2s((vector signed int)side_lo, (vector signed int)zero);
    side_hi = (vector unsigned int)vec_sum2s((vector signed int)side_hi, (vector signed int)zero);

    both = vec_adds(vec_adds(top_lo, top_hi), vec_adds(side_lo, side_hi));

    // Fold the other 64-bit half onto the first so that word 1 holds the
    // complete sum.
    rot = vec_sld(top_hi, top_hi, 8);
    upper = vec_sld(both, rot, 8);
    total = vec_adds(upper, both);

    dc32 = vec_sra(vec_adds(total, rounding), shift);

    // Pack to 16 bits, splat the short at index 1, pack again to 8 bits.
    dc16 = vec_splat(vec_packsu(dc32, dc32), 1);
    dc8 = vec_packsu(dc16, dc16);

    // 32 rows, two vector stores each.
    for (row = 0; row < 32; row++) {
        vec_st(dc8, offs, dst);
        vec_st(dc8, offs + 16, dst);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Shared body of the 32x32 DC-top and DC-left predictors.
 * Sums the 32 bytes at `what` (two aligned loads), computes
 * (sum + 16) >> 5, and writes that byte value into thirty-two 32-byte
 * rows of dst, y_stride bytes apart.
 */
void _common_top_or_left_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
    // This actually has slightly different logic from the 16x16 helper.
    const vector unsigned int zero = vec_splat_u32(0);
    const vector unsigned int shift = vec_splat_u32(5);
    const vector unsigned int eight = vec_splat_u32(8);
    // The rounding term 16 is formed as 8 + 8.
    const vector unsigned int rounding = vec_add(eight, eight);
    vector unsigned int lo, hi, both, rot, upper, total, dc32;
    vector unsigned short dc16;
    vector unsigned char dc8;
    ptrdiff_t offs = 0;
    int row;

    lo = vec_ld(0, (uint32_t *)what);
    hi = vec_ld(16, (uint32_t *)what);

    // Byte sums per word, then per pair of words.
    lo = vec_sum4s((vector unsigned char)lo, zero);
    hi = vec_sum4s((vector unsigned char)hi, zero);
    lo = (vector unsigned int)vec_sum2s((vector signed int)lo, (vector signed int)zero);
    hi = (vector unsigned int)vec_sum2s((vector signed int)hi, (vector signed int)zero);

    both = vec_adds(lo, hi);

    // Fold the other 64-bit half onto the first so that word 1 holds the
    // complete sum.
    rot = vec_sld(hi, hi, 8);
    upper = vec_sld(both, rot, 8);
    total = vec_adds(upper, both);

    dc32 = vec_sra(vec_adds(total, rounding), shift);

    // Pack to 16 bits, splat the short at index 1, pack again to 8 bits.
    dc16 = vec_splat(vec_packsu(dc32, dc32), 1);
    dc8 = vec_packsu(dc16, dc16);

    // 32 rows, two vector stores each.
    for (row = 0; row < 32; row++) {
        vec_st(dc8, offs, dst);
        vec_st(dc8, offs + 16, dst);
        offs += y_stride;
    }
}
|
|
|
|
/* DC-top predictor, 32x32: forwards `above` to the shared 32x32 helper; `left` is not used. */
void vp9_dc_top_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left)
{
    (void)left;
    _common_top_or_left_32x32_vmx(dst, y_stride, above);
}
|
|
/* DC-left predictor, 32x32: forwards `left` to the shared 32x32 helper; `above` is not used. */
void vp9_dc_left_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above, const uint8_t *left)
{
    (void)above;
    _common_top_or_left_32x32_vmx(dst, y_stride, left);
}
|
|
|
|
/*
 * DC-128 predictor, 32x32.
 * Writes the byte value 128 into thirty-two 32-byte rows of dst, y_stride
 * bytes apart, using aligned vec_st stores. Neither `above` nor `left`
 * is read.
 */
void vp9_dc_128_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left)
{
    // 2 << 6 gives 128 in every byte.
    const vector unsigned char fill = vec_sl(vec_splat_u8(2), vec_splat_u8(6));
    ptrdiff_t offs = 0;
    int row;

    (void)above;
    (void)left;

    // 32 rows, two vector stores each.
    for (row = 0; row < 32; row++) {
        vec_st(fill, offs, dst);
        vec_st(fill, offs + 16, dst);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Vertical predictor, 32x32.
 * Loads the 32 bytes at `above` (two aligned vec_ld) and stores them into
 * each of thirty-two rows of dst, y_stride bytes apart (aligned vec_st).
 * `left` is not used.
 */
void vp9_v_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left)
{
    const vector unsigned char top_lo = vec_ld(0, above);
    const vector unsigned char top_hi = vec_ld(16, above);
    ptrdiff_t offs = 0;
    int row;

    (void)left;

    // 32 rows: the same pair of vectors goes into every row.
    for (row = 0; row < 32; row++) {
        vec_st(top_lo, offs, dst);
        vec_st(top_hi, offs + 16, dst);
        offs += y_stride;
    }
}
|
|
|
|
/*
 * Horizontal predictor, 32x32.
 * Row i of dst (rows are y_stride bytes apart) is filled with thirty-two
 * copies of left[i]. `left` is fetched 16 bytes at a time with the
 * unaligned load macro; rows are written with aligned vec_st stores.
 * `above` is not used.
 */
void vp9_h_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left)
{
    // Two 16-byte vectors to 32 splatted vectors.
    // Again, the load doesn't seem to be aligned :(
    // Simplest to run this as a double-pumped 16x16 routine.
    vector unsigned char col;
    ptrdiff_t offs = 0;

    (void)above;

    // vec_splat needs a literal element index, so rows are unrolled.
    // Each row is one splatted byte written as two vectors.
#define H_ROW(n) \
    do { \
        const vector unsigned char px = vec_splat(col, n); \
        vec_st(px, offs, dst); \
        vec_st(px, 16 + offs, dst); \
        offs += y_stride; \
    } while (0)

#define H_16_ROWS() \
    do { \
        H_ROW(0);  H_ROW(1);  H_ROW(2);  H_ROW(3); \
        H_ROW(4);  H_ROW(5);  H_ROW(6);  H_ROW(7); \
        H_ROW(8);  H_ROW(9);  H_ROW(10); H_ROW(11); \
        H_ROW(12); H_ROW(13); H_ROW(14); H_ROW(15); \
    } while (0)

    // Rows 0-15 come from left[0..15].
    _unaligned_load128(col, left);
    H_16_ROWS();

    // Rows 16-31 come from left[16..31].
    _unaligned_load128(col, left + 16);
    H_16_ROWS();

#undef H_16_ROWS
#undef H_ROW
}