mirror of
https://github.com/classilla/tenfourfox.git
synced 2025-01-12 20:30:18 +00:00
#324: start work on intra frame prediction
This commit is contained in:
parent
35ce197aff
commit
10268aa6da
@ -41,6 +41,7 @@ if CONFIG['VPX_VMX_ASM']:
|
||||
'vp8/common/ppc/recon_altivec.s',
|
||||
'vp9/common/ppc/vp9_convolve_vmx.c',
|
||||
'vp9/common/ppc/vp9_idct_intrin_vmx.c',
|
||||
'vp9/common/ppc/vp9_intrapred_vmx.c',
|
||||
]
|
||||
|
||||
arm_asm_files = []
|
||||
|
192
media/libvpx/vp9/common/ppc/vp9_intrapred_vmx.c
Normal file
192
media/libvpx/vp9/common/ppc/vp9_intrapred_vmx.c
Normal file
@ -0,0 +1,192 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Cameron Kaiser and Contributors to TenFourFox
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
#ifndef __ALTIVEC__
|
||||
#error VMX being compiled on non-VMX platform
|
||||
#else
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
||||
/* AltiVec-accelerated VP9 intra frame prediction for big-endian 32-bit PowerPC. */
|
||||
|
||||
// v = vector, s = *uint32_t, vv = temporary vector (unsigned int)
|
||||
|
||||
// Basic notion.
|
||||
#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
|
||||
|
||||
// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.)
|
||||
#define _unaligned_load32(v,s) { v=vec_lde(0,s); v=vec_perm(v,v,vec_lvsl(0,s)); }
|
||||
// Equivalent for _mm_cvtsi128_si32.
|
||||
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }
|
||||
// Equivalent for _mm_loadl_epi64. Simplest just to make this a full load right now.
|
||||
#define _unaligned_load64(v,s) _unaligned_load128(v,s)
|
||||
// Equivalent for _mm_storel_epi64. Essentially acts as two store32s on different elements.
|
||||
#define _unaligned_store64(v,vv,s) {\
|
||||
vv = vec_splat((vector unsigned int)v, 0); vec_ste(vv,0,s);\
|
||||
vv = vec_splat((vector unsigned int)v, 1); vec_ste(vv,4,s);\
|
||||
}
|
||||
|
||||
void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
|
||||
const uint8_t *above, const uint8_t *left)
|
||||
{
|
||||
// Descended from the MMX version, so unaligned. :(
|
||||
|
||||
vector unsigned int m0, m1, m2, m3, m4, m5;
|
||||
vector unsigned short s0, s1;
|
||||
vector unsigned char c0;
|
||||
|
||||
vector unsigned int vzero = vec_splat_u32(0);
|
||||
vector unsigned int vfour = vec_splat_u32(4);
|
||||
vector unsigned int vthree = vec_splat_u32(3);
|
||||
|
||||
_unaligned_load32(m0, (uint32_t *)above);
|
||||
_unaligned_load32(m1, (uint32_t *)left);
|
||||
m2 = vec_mergeh(m0, m1); // punpckldq
|
||||
|
||||
// The Intel MMX version computes a sum of absolute differences
|
||||
// against a vector of zero, so this is really just a cross sum.
|
||||
|
||||
m3 = vec_sum4s((vector unsigned char)m2, vzero);
|
||||
m4 = (vector unsigned int)vec_sum2s((vector signed int)m3, (vector signed int)vzero);
|
||||
// Leave as 32-bit. Compute on that.
|
||||
m0 = vec_add(m4, vfour);
|
||||
m5 = vec_sra(m0, vthree);
|
||||
|
||||
// Pack to 16 bits, splat the short, and pack again to yield 8 bits.
|
||||
s0 = vec_packsu(m5, m5);
|
||||
// ENDIAN NOTE!
|
||||
// We splat position *1* because we were working on the low-order 64 bits.
|
||||
// Since our 32-bit result was in the higher word of the low 64 bits, it's
|
||||
// index 1, and since we just shifted down, it's *still* index 1.
|
||||
s1 = vec_splat(s0, 1);
|
||||
c0 = vec_packsu(s1, s1);
|
||||
|
||||
_unaligned_store32(c0, m0, (uint32_t *)dst);
|
||||
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride));
|
||||
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride + y_stride));
|
||||
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride + y_stride + y_stride));
|
||||
}
|
||||
|
||||
void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
|
||||
const uint8_t *above, const uint8_t *left)
|
||||
{
|
||||
// Descended from the MMX version, so unaligned. :(
|
||||
|
||||
vector unsigned int m0, m1, m2, m3, m4, m5, m6;
|
||||
vector unsigned short s0, s1;
|
||||
vector unsigned char c0;
|
||||
ptrdiff_t offs = 0;
|
||||
|
||||
vector unsigned int vzero = vec_splat_u32(0);
|
||||
vector unsigned int vfour = vec_splat_u32(4);
|
||||
vector unsigned int veight = vec_splat_u32(8);
|
||||
|
||||
_unaligned_load64(m0, (uint32_t *)above);
|
||||
_unaligned_load64(m1, (uint32_t *)left);
|
||||
|
||||
// Same as above, an SAD calculation against a zero vector, but twice.
|
||||
m3 = vec_sum4s((vector unsigned char)m0, vzero);
|
||||
m5 = vec_sum4s((vector unsigned char)m1, vzero);
|
||||
m4 = (vector unsigned int)vec_sum2s((vector signed int)m3, (vector signed int)vzero);
|
||||
m6 = (vector unsigned int)vec_sum2s((vector signed int)m5, (vector signed int)vzero);
|
||||
// Continue computations in 32-bit pending pack/splat/pack.
|
||||
m1 = vec_adds(m4, m6);
|
||||
m0 = vec_adds(m1, veight);
|
||||
m5 = vec_sra(m0, vfour);
|
||||
|
||||
// Pack to 16 bits, splat the short, and pack again to yield 8 bits.
|
||||
s0 = vec_packsu(m5, m5);
|
||||
s1 = vec_splat(s0, 1);
|
||||
c0 = vec_packsu(s1, s1);
|
||||
|
||||
#define NEXT _unaligned_store64(c0, m0, (uint32_t *)(dst+offs)); offs+=y_stride;
|
||||
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
|
||||
#undef NEXT
|
||||
}
|
||||
|
||||
void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
|
||||
const uint8_t *above, const uint8_t *left)
|
||||
{
|
||||
// Finally, alignment! The use of movdqa in the Intel SSE2 version
|
||||
// for both loads and stores implies we can safely use aligned
|
||||
// loads and stores here as well.
|
||||
|
||||
vector unsigned int m0, m1, m2, m3, m4, m5, m6;
|
||||
vector unsigned short s0, s1;
|
||||
vector unsigned char c0;
|
||||
ptrdiff_t offs = 0;
|
||||
|
||||
vector unsigned int v16 = vec_splat_u32(8); // Computed momentarily
|
||||
vector unsigned int vone = vec_splat_u32(1);
|
||||
vector unsigned int vzero = vec_splat_u32(0);
|
||||
vector unsigned int vfive = vec_splat_u32(5);
|
||||
|
||||
m0 = vec_ld(0, (uint32_t *)above);
|
||||
m1 = vec_ld(0, (uint32_t *)left);
|
||||
|
||||
// The SSE2 version starts using 32-bit words, as we do.
|
||||
m2 = vec_sum4s((vector unsigned char)m0, vzero);
|
||||
m3 = vec_sum4s((vector unsigned char)m1, vzero);
|
||||
m2 = (vector unsigned int)vec_sum2s((vector signed int)m2, (vector signed int)vzero);
|
||||
m3 = (vector unsigned int)vec_sum2s((vector signed int)m3, (vector signed int)vzero);
|
||||
|
||||
v16 = vec_sl(v16, vone);
|
||||
m4 = vec_adds(m2, m3);
|
||||
|
||||
// Combine 64 bits of m3 with m4 (equivalent to movhlps).
|
||||
m5 = vec_sld(m3, m3, 8);
|
||||
m6 = vec_sld(m4, m5, 8);
|
||||
|
||||
m0 = vec_adds(m4, m6);
|
||||
m1 = vec_adds(m0, v16);
|
||||
m2 = vec_sra(m1, vfive);
|
||||
|
||||
// Pack to 16 bits, splat the short, and pack again to yield 8 bits.
|
||||
s0 = vec_packsu(m2, m2);
|
||||
s1 = vec_splat(s0, 1);
|
||||
c0 = vec_packsu(s1, s1);
|
||||
|
||||
// 16 stores
|
||||
#define NEXT vec_st(c0, offs, dst); offs += y_stride;
|
||||
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
NEXT
|
||||
|
||||
#undef NEXT
|
||||
}
|
@ -166,15 +166,18 @@ void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t
|
||||
#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
|
||||
|
||||
void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
|
||||
void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
|
||||
#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
|
||||
|
||||
void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
|
||||
#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
|
||||
|
||||
void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
|
||||
void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
|
||||
#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
|
||||
|
||||
void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
|
||||
void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
|
||||
#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
|
||||
|
||||
void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
|
||||
|
Loading…
x
Reference in New Issue
Block a user