#324: complete intra predictors for h/v/dc/128

This commit is contained in:
Cameron Kaiser 2017-11-22 17:27:20 -08:00
parent f6c2519506
commit 7bfd387acb
2 changed files with 815 additions and 32 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018 Cameron Kaiser and Contributors to TenFourFox
* Copyright (c) 2017 Cameron Kaiser and Contributors to TenFourFox
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -21,10 +21,12 @@
// v = vector, s = *uint32_t, vv = temporary vector (unsigned int)
// Basic notion.
// Basic notion. Guaranteed to work at any offset.
#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.)
// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.) However,
// this may have issues loading at really weird addresses if they're not
// minimally word-aligned.
#define _unaligned_load32(v,s) { v=vec_lde(0,s); v=vec_perm(v,v,vec_lvsl(0,s)); }
// Equivalent for _mm_cvtsi128_si32.
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }
@ -36,6 +38,8 @@
vv = vec_splat((vector unsigned int)v, 1); vec_ste(vv,4,s);\
}
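For readers less familiar with AltiVec: the macros above emulate unaligned loads and stores, which VMX lacks, by combining aligned vector loads with vec_perm/vec_lvsl and single-element stores. A minimal scalar sketch of what the 32-bit pair is intended to be equivalent to, ignoring the word-alignment caveat noted above (the helper names are illustrative and not part of this patch):
#include <stdint.h>
#include <string.h>
/* Hypothetical scalar equivalents of _unaligned_load32/_unaligned_store32:
   an unaligned-safe 32-bit load and store done via memcpy. */
static inline uint32_t scalar_load32(const uint8_t *s) {
    uint32_t v;
    memcpy(&v, s, sizeof(v));
    return v;
}
static inline void scalar_store32(uint8_t *d, uint32_t v) {
    memcpy(d, &v, sizeof(v));
}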
/* 4x4 */
void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left)
{
@ -71,12 +75,160 @@ void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
_unaligned_store32(c0, m0, (uint32_t *)dst);
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride));
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride + y_stride));
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride + y_stride + y_stride));
// This is faster than a whole bunch of _unaligned_store32s because we
// already splatted the vector, so it's the same at all positions.
vec_ste((vector unsigned int)c0, 0, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride + y_stride, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
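For context, the rule this routine vectorizes is the standard VP9 DC prediction: average the four above and four left pixels with rounding, then fill the block with that value. A minimal scalar sketch, not part of the patch:
#include <stdint.h>
#include <stddef.h>
/* Scalar sketch of 4x4 DC prediction:
   dc = (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3, then fill the block. */
static void dc_4x4_scalar(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left) {
    int sum = 0;
    for (int i = 0; i < 4; i++)
        sum += above[i] + left[i];
    const uint8_t dc = (uint8_t)((sum + 4) >> 3);
    for (int r = 0; r < 4; r++, dst += stride)
        for (int c = 0; c < 4; c++)
            dst[c] = dc;
}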
inline void _common_top_or_left_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
// Similar idea to the standard predictor, but using only one edge
// (the top row or the left column).
vector unsigned int m0, m1, m2, m3;
vector unsigned short s0, s1;
vector unsigned char c0;
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned int vtwo = vec_splat_u32(2);
_unaligned_load32(m1, (uint32_t *)what);
// Interleave zero to clear out the upper bits so that we get a
// proper cross-sum over the full range.
m0 = vec_mergeh(m1, vzero);
m2 = vec_sum4s((vector unsigned char)m0, vzero);
m3 = (vector unsigned int)vec_sum2s((vector signed int)m2, (vector signed int)vzero);
m0 = vec_add(m3, vtwo);
m1 = vec_sra(m0, vtwo);
s0 = vec_packsu(m1, m1);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
vec_ste((vector unsigned int)c0, 0, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride + y_stride, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
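The one-sided average computed here is (sum of the four available pixels + 2) >> 2, which is what the vtwo add and shift implement; the larger one-sided variants below follow the same pattern with (sum + 4) >> 3, (sum + 8) >> 4 and (sum + 16) >> 5. A scalar sketch for the 4x4 case:
#include <stdint.h>
/* Scalar sketch of the one-sided (top-only or left-only) 4x4 DC value. */
static uint8_t dc_top_or_left_4(const uint8_t *edge) {
    return (uint8_t)((edge[0] + edge[1] + edge[2] + edge[3] + 2) >> 2);
}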
void vp9_dc_top_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_4x4_vmx(dst, y_stride, above);
}
void vp9_dc_left_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_4x4_vmx(dst, y_stride, left);
}
void vp9_dc_128_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Pretty much just blit 128s.
// Storing from a char vector with vec_ste only writes a single byte,
// so splat to int instead.
vector unsigned int m0 = (vector unsigned int)vec_sl(vec_splat_u8(2), vec_splat_u8(6));
vec_ste(m0, 0, (uint32_t *)dst);
vec_ste(m0, y_stride, (uint32_t *)dst);
vec_ste(m0, y_stride + y_stride, (uint32_t *)dst);
vec_ste(m0, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
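The 128 constant is synthesized as 2 << 6 because vec_splat_u8 only accepts a 5-bit immediate (-16..15). The predictor itself is trivial; a scalar sketch for the 4x4 case:
#include <stdint.h>
#include <stddef.h>
/* Scalar sketch of the 128 predictor: fill the block with the mid-grey value 128. */
static void dc_128_4x4_scalar(uint8_t *dst, ptrdiff_t stride) {
    for (int r = 0; r < 4; r++, dst += stride)
        for (int c = 0; c < 4; c++)
            dst[c] = 128;
}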
void vp9_v_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Pretty much just copy.
vector unsigned int m0, m1;
_unaligned_load32(m1, (uint32_t *)above);
m0 = vec_splat(m1, 0);
vec_ste(m0, 0, (uint32_t *)dst);
vec_ste(m0, y_stride, (uint32_t *)dst);
vec_ste(m0, y_stride + y_stride, (uint32_t *)dst);
vec_ste(m0, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
void vp9_h_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Expand sequence
// aa bb cc dd -- -- -- -- -- ....
// to
// aa aa aa aa bb bb bb bb cc ....
// This can be done with just splats.
vector unsigned char c0;
vector unsigned int m0, m1, m2, m3;
vector unsigned char vzero = vec_splat_u8(0);
_unaligned_load32(c0, (uint32_t *)left);
m0 = (vector unsigned int)vec_splat(c0, 0);
m1 = (vector unsigned int)vec_splat(c0, 1);
vec_ste(m0, 0, (uint32_t *)dst);
m2 = (vector unsigned int)vec_splat(c0, 2);
vec_ste(m1, y_stride, (uint32_t *)dst);
m3 = (vector unsigned int)vec_splat(c0, 3);
vec_ste(m2, y_stride + y_stride, (uint32_t *)dst);
vec_ste(m3, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
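The per-byte splats above implement the H (horizontal) rule: row i of the block is a copy of left[i]. A scalar sketch for the 4x4 case:
#include <stdint.h>
#include <stddef.h>
/* Scalar sketch of the 4x4 H predictor: each row is filled with the matching left pixel. */
static void h_4x4_scalar(uint8_t *dst, ptrdiff_t stride, const uint8_t *left) {
    for (int r = 0; r < 4; r++, dst += stride)
        for (int c = 0; c < 4; c++)
            dst[c] = left[r];
}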
#if(0)
// This doesn't work properly, and the large amount of unaligned
// memory access in the True Motion predictors makes them a poor
// fit for AltiVec.
void vp9_tm_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Get the top-left pixel (above[-1]) and splat it ("tl").
// Using that preceding byte, compute t[i] - tl + l[i] for the
// appropriate block size, where t = top and l = left, against our
// splatted tl vector.
vector unsigned char c0, c1, c2, c3, c4;
vector unsigned short s0, s1, s2, s3, s4, s5, s6, tl;
vector unsigned int m0;
vector unsigned char vzero = vec_splat_u8(0);
ptrdiff_t offs = 0;
// This can load at really weird addresses, so our
// faster unaligned load32 macro is not sufficient.
_unaligned_load128(c0, (uint32_t *)(above - 1));
c1 = vec_splat(c0, 0);
tl = vec_mergeh(vzero, c1);
// Expand t to short and subtract tl.
_unaligned_load128(c0, (uint32_t *)above);
s1 = vec_mergeh(vzero, c0);
s0 = vec_sub(s1, tl);
#define TM_2X2(x) \
_unaligned_load128(c2, (uint32_t *)(left + 4 - x)); \
_unaligned_load128(c3, (uint32_t *)(left + 5 - x)); \
s2 = vec_mergeh(vzero, c2); \
s3 = vec_mergeh(vzero, c3); \
s4 = vec_splat(s2, 0); \
s5 = vec_splat(s3, 0); \
s2 = vec_add(s0, s4); \
s3 = vec_add(s0, s5); \
c2 = vec_packsu(s2, s2); \
c3 = vec_packsu(s3, s3); \
c4 = vec_perm(c2, c2, vec_lvsr(0, (uint32_t *)(dst + offs))); \
vec_ste(c4, 0, (dst + offs)); \
vec_ste(c4, 1, (dst + offs)); \
vec_ste(c4, 2, (dst + offs)); \
vec_ste(c4, 3, (dst + offs)); \
c4 = vec_perm(c3, c3, vec_lvsr(0, (uint32_t *)(dst + offs + y_stride))); \
vec_ste(c4, 0, (dst + offs + y_stride)); \
vec_ste(c4, 1, (dst + offs + y_stride)); \
vec_ste(c4, 2, (dst + offs + y_stride)); \
vec_ste(c4, 3, (dst + offs + y_stride)); \
offs += y_stride + y_stride;
TM_2X2(4)
TM_2X2(2)
#undef TM_2X2
}
#endif
/* 8x8 */
void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left)
{
@ -109,7 +261,9 @@ void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
#define NEXT _unaligned_store64(c0, m0, (uint32_t *)(dst+offs)); offs+=y_stride;
// Again, faster than repeated _unaligned_store64s since we
// already splatted.
#define NEXT vec_ste((vector unsigned int)c0, offs, (uint32_t *)dst); vec_ste((vector unsigned int)c0, 4+offs, (uint32_t *)dst); offs+=y_stride;
NEXT
NEXT
@ -123,6 +277,133 @@ void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
#undef NEXT
}
inline void _common_top_or_left_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
// Again, the single-edge variant.
vector unsigned int m0, m1, m2, m3, m4;
vector unsigned short s0, s1;
vector unsigned char c0;
ptrdiff_t offs = 0;
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned int vfour = vec_splat_u32(4);
vector unsigned int vthree = vec_splat_u32(3);
_unaligned_load64(m0, (uint32_t *)what);
// Since all the functions from here on load at least 64 bits, we no longer
// need to interleave zero to clear out the other half before the SAD.
m1 = vec_sum4s((vector unsigned char)m0, vzero);
m2 = (vector unsigned int)vec_sum2s((vector signed int)m1, (vector signed int)vzero);
m3 = vec_adds(m2, vfour);
m4 = vec_sra(m3, vthree);
s0 = vec_packsu(m4, m4);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
#define NEXT vec_ste((vector unsigned int)c0, offs, (uint32_t *)dst); vec_ste((vector unsigned int)c0, 4+offs, (uint32_t *)dst); offs+=y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_dc_top_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_8x8_vmx(dst, y_stride, above);
}
void vp9_dc_left_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_8x8_vmx(dst, y_stride, left);
}
void vp9_dc_128_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Yup, blitting 128s again.
vector unsigned int m0 = (vector unsigned int)vec_sl(vec_splat_u8(2), vec_splat_u8(6));
ptrdiff_t offs = 0;
#define NEXT vec_ste(m0, offs, (uint32_t *)dst); vec_ste(m0, 4+offs, (uint32_t *)dst); offs+=y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_v_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Yup, copying again.
vector unsigned int m0, m1;
ptrdiff_t offs = 0;
_unaligned_load64(m1, (uint32_t *)above);
m0 = vec_splat(m1, 0);
m1 = vec_splat(m1, 1);
#define NEXT vec_ste(m0, offs, (uint32_t *)dst); vec_ste(m1, 4+offs, (uint32_t *)dst); offs+=y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_h_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Expand sequence
// aa bb cc dd ee ff gg hh -- -- -- -- -- ....
// to
// aa aa aa aa bb bb bb bb cc cc cc cc dd ....
vector unsigned char c0;
vector unsigned int m0, m1, m2;
ptrdiff_t offs = 0;
_unaligned_load64(c0, (uint32_t *)left);
#define STORE(x) vec_ste(x, offs, (uint32_t *)dst); vec_ste(x, 4+offs, (uint32_t *)dst); offs+=y_stride;
m0 = (vector unsigned int)vec_splat(c0, 0);
m1 = (vector unsigned int)vec_splat(c0, 1);
STORE(m0)
m2 = (vector unsigned int)vec_splat(c0, 2);
STORE(m1)
m0 = (vector unsigned int)vec_splat(c0, 3);
STORE(m2)
m1 = (vector unsigned int)vec_splat(c0, 4);
STORE(m0)
m2 = (vector unsigned int)vec_splat(c0, 5);
STORE(m1)
m0 = (vector unsigned int)vec_splat(c0, 6);
STORE(m2)
m1 = (vector unsigned int)vec_splat(c0, 7);
STORE(m0)
STORE(m1)
#undef STORE
}
/* 16x16 */
void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left)
{
@ -189,4 +470,485 @@ void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
NEXT
#undef NEXT
}
void _common_top_or_left_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
vector unsigned int m0, m1, m2, m3;
vector unsigned short s0, s1;
vector unsigned char c0;
ptrdiff_t offs = 0;
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned int vfour = vec_splat_u32(4);
vector unsigned int veight = vec_splat_u32(8);
m0 = vec_ld(0, (uint32_t *)what);
// The Intel version is identical to the full 16x16 except for
// zeroing out the additional vector; otherwise it does all the
// same computations. This is clearly wasteful, so I've elided
// them here. In particular, an SAD of zero against zero will
// always be zero, so we can just drop one of the SADs right now.
m1 = vec_sum4s((vector unsigned char)m0, vzero);
m2 = (vector unsigned int)vec_sum2s((vector signed int)m1, (vector signed int)vzero);
// Also, we don't need the full movhlps steps because the other vector
// will always be zero, so only one vector shift is required.
m3 = vec_sld(m2, vzero, 8);
m0 = vec_adds(m3, m2);
m1 = vec_adds(m0, veight);
m2 = vec_sra(m1, vfour);
s0 = vec_packsu(m2, m2);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
// 16 stores
#define NEXT vec_st(c0, offs, dst); offs += y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
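To make the elided reduction explicit: vec_sum4s leaves four per-word partial sums, vec_sum2s collapses those into the odd-numbered words, and a single 8-byte vec_sld plus add then yields the grand total, so no second (all-zero) SAD or movhlps-style shuffle pair is needed. A scalar sketch of that reduction over 16 bytes:
#include <stdint.h>
/* Scalar view of the horizontal reduction: four 4-byte partial sums (vec_sum4s),
   two pair sums (vec_sum2s), then one final add (vec_sld + vec_adds). */
static int reduce_16_bytes(const uint8_t *p) {
    int partial[4] = { 0, 0, 0, 0 };
    for (int i = 0; i < 16; i++)
        partial[i >> 2] += p[i];
    const int pair0 = partial[0] + partial[1];
    const int pair1 = partial[2] + partial[3];
    return pair0 + pair1;
}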
void vp9_dc_top_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_16x16_vmx(dst, y_stride, above);
}
void vp9_dc_left_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_16x16_vmx(dst, y_stride, left);
}
void vp9_dc_128_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Mmmm, 128 splat splat splat.
vector unsigned char c0 = vec_sl(vec_splat_u8(2), vec_splat_u8(6));
ptrdiff_t offs = 0;
// 16 stores
#define NEXT vec_st(c0, offs, dst); offs += y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_v_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Mmmm, vector copy copy copy.
vector unsigned char c0 = vec_ld(0, above);
ptrdiff_t offs = 0;
// 16 stores
#define NEXT vec_st(c0, offs, dst); offs += y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_h_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Expand an entire 16-byte vector to 16 splatted vectors.
// Unfortunately, the load is not aligned, but the stores are.
vector unsigned char c0, c1, c2, c3;
ptrdiff_t offs = 0;
_unaligned_load128(c0, left);
// 16 stores
#define SPLAT(n,x) x = vec_splat(c0, n);
#define STORE(x) vec_st(x, offs, dst); offs += y_stride;
SPLAT(0,c1)
SPLAT(1,c2)
STORE(c1)
SPLAT(2,c3)
STORE(c2)
SPLAT(3,c1)
STORE(c3)
SPLAT(4,c2)
STORE(c1)
SPLAT(5,c3)
STORE(c2)
SPLAT(6,c1)
STORE(c3)
SPLAT(7,c2)
STORE(c1)
SPLAT(8,c3)
STORE(c2)
SPLAT(9,c1)
STORE(c3)
SPLAT(10,c2)
STORE(c1)
SPLAT(11,c3)
STORE(c2)
SPLAT(12,c1)
STORE(c3)
SPLAT(13,c2)
STORE(c1)
SPLAT(14,c3)
STORE(c2)
SPLAT(15,c1)
STORE(c3)
STORE(c1)
#undef STORE
#undef SPLAT
}
/* 32x32 */
void vp9_dc_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left)
{
// Also aligned.
// Approximately the same routine, but double-pumped.
vector unsigned int m0, m1, m2, m3, m4, m5, m6, m7, m8;
vector unsigned short s0, s1;
vector unsigned char c0;
ptrdiff_t offs = 0;
vector unsigned int v32 = vec_splat_u32(8); // Computed momentarily
vector unsigned int vtwo = vec_splat_u32(2);
vector unsigned int vsix = vec_splat_u32(6);
vector unsigned int vzero = vec_splat_u32(0);
m1 = vec_ld(0, (uint32_t *)above);
m2 = vec_ld(16, (uint32_t *)above);
m3 = vec_ld(0, (uint32_t *)left);
m4 = vec_ld(16, (uint32_t *)left);
v32 = vec_sl(v32, vtwo);
m5 = vec_sum4s((vector unsigned char)m1, vzero);
m6 = vec_sum4s((vector unsigned char)m2, vzero);
m7 = vec_sum4s((vector unsigned char)m3, vzero);
m8 = vec_sum4s((vector unsigned char)m4, vzero);
m1 = (vector unsigned int)vec_sum2s((vector signed int)m5, (vector signed int)vzero);
m2 = (vector unsigned int)vec_sum2s((vector signed int)m6, (vector signed int)vzero);
m3 = (vector unsigned int)vec_sum2s((vector signed int)m7, (vector signed int)vzero);
m4 = (vector unsigned int)vec_sum2s((vector signed int)m8, (vector signed int)vzero);
m5 = vec_adds(m1, m2);
m6 = vec_adds(m3, m4);
m0 = vec_adds(m5, m6);
m1 = vec_sld(m2, m2, 8);
m3 = vec_sld(m0, m1, 8);
m4 = vec_adds(m3, m0);
m1 = vec_adds(m4, v32);
m2 = vec_sra(m1, vsix);
s0 = vec_packsu(m2, m2);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
// 32 stores
#define NEXT vec_st(c0, offs, dst); vec_st(c0, offs+16, dst); offs += y_stride;
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
#undef NEXT
}
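As with the smaller sizes, the arithmetic here is (sum of the 64 neighbouring pixels + 32) >> 6, and because vec_splat_u32 is also limited to a -16..15 immediate, the constants are built up: 32 as 8 << 2 here, and 16 as 8 + 8 in the one-sided variant that follows. A scalar sketch of the DC value:
#include <stdint.h>
/* Scalar sketch of the 32x32 DC value: 32 above + 32 left pixels, averaged with rounding. */
static uint8_t dc_32x32_scalar(const uint8_t *above, const uint8_t *left) {
    int sum = 0;
    for (int i = 0; i < 32; i++)
        sum += above[i] + left[i];
    return (uint8_t)((sum + 32) >> 6);
}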
void _common_top_or_left_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
// This actually has slightly different logic.
vector unsigned int m0, m1, m2, m3, m4, m5, m6;
vector unsigned short s0, s1;
vector unsigned char c0;
ptrdiff_t offs = 0;
vector unsigned int v16 = vec_splat_u32(8); // Computed momentarily
vector unsigned int vfive = vec_splat_u32(5);
vector unsigned int vzero = vec_splat_u32(0);
m0 = vec_ld(0, (uint32_t *)what);
m2 = vec_ld(16, (uint32_t *)what);
v16 = vec_add(v16, v16);
m5 = vec_sum4s((vector unsigned char)m0, vzero);
m6 = vec_sum4s((vector unsigned char)m2, vzero);
m4 = (vector unsigned int)vec_sum2s((vector signed int)m5, (vector signed int)vzero);
m2 = (vector unsigned int)vec_sum2s((vector signed int)m6, (vector signed int)vzero);
m0 = vec_adds(m4, m2);
m1 = vec_sld(m2, m2, 8);
m3 = vec_sld(m0, m1, 8);
m4 = vec_adds(m3, m0);
m1 = vec_adds(m4, v16);
m2 = vec_sra(m1, vfive);
s0 = vec_packsu(m2, m2);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
// 32 stores
#define NEXT vec_st(c0, offs, dst); vec_st(c0, offs+16, dst); offs += y_stride;
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
#undef NEXT
}
void vp9_dc_top_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_32x32_vmx(dst, y_stride, above);
}
void vp9_dc_left_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_32x32_vmx(dst, y_stride, left);
}
void vp9_dc_128_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Oh baby, I love to feel those creamy 128s ru... um, sorry.
// What were we doing again?
vector unsigned char c0 = vec_sl(vec_splat_u8(2), vec_splat_u8(6));
ptrdiff_t offs = 0;
// 32 stores
#define NEXT vec_st(c0, offs, dst); vec_st(c0, offs+16, dst); offs += y_stride;
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
#undef NEXT
}
void vp9_v_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Is it hot in here or is it just me?
// Oh, right, copying even more data.
vector unsigned char c0 = vec_ld(0, above);
vector unsigned char c1 = vec_ld(16, above);
ptrdiff_t offs = 0;
// 32 stores
#define NEXT vec_st(c0, offs, dst); vec_st(c1, offs+16, dst); offs += y_stride;
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
#undef NEXT
}
void vp9_h_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Two 16-byte vectors to 32 splatted vectors.
// Again, the load doesn't seem to be aligned :(
// Simplest to run this as a double-pumped 16x16 routine.
vector unsigned char c0, c1, c2, c3;
ptrdiff_t offs = 0;
_unaligned_load128(c0, left);
// 32 stores
#define SPLAT(n,x) x = vec_splat(c0, n);
#define STORE(x) vec_st(x, offs, dst); vec_st(x, 16+offs, dst); offs += y_stride;
SPLAT(0,c1)
SPLAT(1,c2)
STORE(c1)
SPLAT(2,c3)
STORE(c2)
SPLAT(3,c1)
STORE(c3)
SPLAT(4,c2)
STORE(c1)
SPLAT(5,c3)
STORE(c2)
SPLAT(6,c1)
STORE(c3)
SPLAT(7,c2)
STORE(c1)
SPLAT(8,c3)
STORE(c2)
SPLAT(9,c1)
STORE(c3)
SPLAT(10,c2)
STORE(c1)
SPLAT(11,c3)
STORE(c2)
SPLAT(12,c1)
STORE(c3)
SPLAT(13,c2)
STORE(c1)
SPLAT(14,c3)
STORE(c2)
SPLAT(15,c1)
STORE(c3)
_unaligned_load128(c0, left + 16);
STORE(c1)
// 32 more stores
SPLAT(0,c1)
SPLAT(1,c2)
STORE(c1)
SPLAT(2,c3)
STORE(c2)
SPLAT(3,c1)
STORE(c3)
SPLAT(4,c2)
STORE(c1)
SPLAT(5,c3)
STORE(c2)
SPLAT(6,c1)
STORE(c3)
SPLAT(7,c2)
STORE(c1)
SPLAT(8,c3)
STORE(c2)
SPLAT(9,c1)
STORE(c3)
SPLAT(10,c2)
STORE(c1)
SPLAT(11,c3)
STORE(c2)
SPLAT(12,c1)
STORE(c3)
SPLAT(13,c2)
STORE(c1)
SPLAT(14,c3)
STORE(c2)
SPLAT(15,c1)
STORE(c3)
STORE(c1)
#undef STORE
#undef SPLAT
}

View File

@ -142,55 +142,68 @@ void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
void vp9_dc_128_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_vmx
void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
void vp9_dc_128_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_vmx
void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
void vp9_dc_128_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_vmx
void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
void vp9_dc_128_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_vmx
void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
void vp9_dc_left_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_vmx
void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
void vp9_dc_left_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_vmx
void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
void vp9_dc_left_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_vmx
void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
void vp9_dc_left_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_vmx
void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_vmx
void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
void vp9_dc_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_vmx
void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_vmx
void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_vmx
void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
void vp9_dc_top_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_vmx
void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
void vp9_dc_top_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_vmx
void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
void vp9_dc_top_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_vmx
void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
void vp9_dc_top_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_vmx
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@ -244,16 +257,20 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
void vp9_h_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_vmx
void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
void vp9_h_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_vmx
void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
void vp9_h_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_vmx
void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
void vp9_h_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_vmx
void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
@ -477,16 +494,20 @@ void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abo
#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
void vp9_v_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_vmx
void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
void vp9_v_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_vmx
void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
void vp9_v_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_vmx
void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
void vp9_v_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_vmx
int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
#define vp9_vector_var vp9_vector_var_c