From 10268aa6da112f0df657b2f7cbe6bcbcf26264bb Mon Sep 17 00:00:00 2001
From: Cameron Kaiser
Date: Fri, 3 Nov 2017 20:34:22 -0700
Subject: [PATCH] #324: start work on intra frame prediction

---
 media/libvpx/moz.build                        |   1 +
 .../libvpx/vp9/common/ppc/vp9_intrapred_vmx.c | 192 ++++++++++++++++++
 media/libvpx/vp9_rtcd_tenfourfox_altivec.h    |   3 +
 3 files changed, 196 insertions(+)
 create mode 100644 media/libvpx/vp9/common/ppc/vp9_intrapred_vmx.c

diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build
index 0e1924c98..9ef1cdfa8 100644
--- a/media/libvpx/moz.build
+++ b/media/libvpx/moz.build
@@ -41,6 +41,7 @@ if CONFIG['VPX_VMX_ASM']:
         'vp8/common/ppc/recon_altivec.s',
         'vp9/common/ppc/vp9_convolve_vmx.c',
         'vp9/common/ppc/vp9_idct_intrin_vmx.c',
+        'vp9/common/ppc/vp9_intrapred_vmx.c',
     ]
 
 arm_asm_files = []
diff --git a/media/libvpx/vp9/common/ppc/vp9_intrapred_vmx.c b/media/libvpx/vp9/common/ppc/vp9_intrapred_vmx.c
new file mode 100644
index 000000000..8efeed405
--- /dev/null
+++ b/media/libvpx/vp9/common/ppc/vp9_intrapred_vmx.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2018 Cameron Kaiser and Contributors to TenFourFox
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#ifndef __ALTIVEC__
+#error VMX being compiled on non-VMX platform
+#else
+#include <altivec.h>
+#endif
+
+/* AltiVec-accelerated VP9 intra frame prediction for big-endian 32-bit PowerPC. */
+
+// v = vector, s = *uint32_t, vv = temporary vector (unsigned int)
+
+// Basic notion: an unaligned 128-bit load via lvsl/vperm.
+#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
+
+// Equivalent of _mm_cvtsi32_si128. (Upper 96 bits undefined.)
+#define _unaligned_load32(v,s) { v=vec_lde(0,s); v=vec_perm(v,v,vec_lvsl(0,s)); }
+// Equivalent of _mm_cvtsi128_si32.
+#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }
+// Equivalent of _mm_loadl_epi64. Simplest just to make this a full load right now.
+#define _unaligned_load64(v,s) _unaligned_load128(v,s)
+// Equivalent of _mm_storel_epi64. Essentially acts as two store32s on different elements.
+#define _unaligned_store64(v,vv,s) {\
+    vv = vec_splat((vector unsigned int)v, 0); vec_ste(vv,0,s);\
+    vv = vec_splat((vector unsigned int)v, 1); vec_ste(vv,4,s);\
+}
+
+void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left)
+{
+  // Descended from the MMX version, so unaligned. :(
+
+  vector unsigned int m0, m1, m2, m3, m4, m5;
+  vector unsigned short s0, s1;
+  vector unsigned char c0;
+
+  vector unsigned int vzero = vec_splat_u32(0);
+  vector unsigned int vfour = vec_splat_u32(4);
+  vector unsigned int vthree = vec_splat_u32(3);
+
+  _unaligned_load32(m0, (uint32_t *)above);
+  _unaligned_load32(m1, (uint32_t *)left);
+  m2 = vec_mergeh(m0, m1); // punpckldq
+
+  // The Intel MMX version computes a sum of absolute differences
+  // against a vector of zero, so this is really just a cross sum.
+
+  m3 = vec_sum4s((vector unsigned char)m2, vzero);
+  m4 = (vector unsigned int)vec_sum2s((vector signed int)m3, (vector signed int)vzero);
+  // Leave as 32-bit. Compute on that.
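+  // At this point element 1 of m4 holds sum(above[0..3]) + sum(left[0..3]);
+  // the add of 4 and shift right by 3 below form the rounded DC value of
+  // the eight edge pixels.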
+  m0 = vec_add(m4, vfour);
+  m5 = vec_sra(m0, vthree);
+
+  // Pack to 16 bits, splat the short, and pack again to yield 8 bits.
+  s0 = vec_packsu(m5, m5);
+  // ENDIAN NOTE!
+  // We splat position *1* because we were working on the low-order 64 bits.
+  // Since our 32-bit result was in the higher word of the low 64 bits, it's
+  // index 1, and since we just shifted down, it's *still* index 1.
+  s1 = vec_splat(s0, 1);
+  c0 = vec_packsu(s1, s1);
+
+  _unaligned_store32(c0, m0, (uint32_t *)dst);
+  _unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride));
+  _unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride + y_stride));
+  _unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride + y_stride + y_stride));
+}
+
+void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left)
+{
+  // Descended from the MMX version, so unaligned. :(
+
+  vector unsigned int m0, m1, m2, m3, m4, m5, m6;
+  vector unsigned short s0, s1;
+  vector unsigned char c0;
+  ptrdiff_t offs = 0;
+
+  vector unsigned int vzero = vec_splat_u32(0);
+  vector unsigned int vfour = vec_splat_u32(4);
+  vector unsigned int veight = vec_splat_u32(8);
+
+  _unaligned_load64(m0, (uint32_t *)above);
+  _unaligned_load64(m1, (uint32_t *)left);
+
+  // Same as above, an SAD calculation against a zero vector, but twice.
+  m3 = vec_sum4s((vector unsigned char)m0, vzero);
+  m5 = vec_sum4s((vector unsigned char)m1, vzero);
+  m4 = (vector unsigned int)vec_sum2s((vector signed int)m3, (vector signed int)vzero);
+  m6 = (vector unsigned int)vec_sum2s((vector signed int)m5, (vector signed int)vzero);
+  // Continue computations in 32-bit pending pack/splat/pack.
+  m1 = vec_adds(m4, m6);
+  m0 = vec_adds(m1, veight);
+  m5 = vec_sra(m0, vfour);
+
+  // Pack to 16 bits, splat the short, and pack again to yield 8 bits.
+  s0 = vec_packsu(m5, m5);
+  s1 = vec_splat(s0, 1);
+  c0 = vec_packsu(s1, s1);
+
+#define NEXT _unaligned_store64(c0, m0, (uint32_t *)(dst+offs)); offs+=y_stride;
+
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+
+#undef NEXT
+}
+
+void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left)
+{
+  // Finally, alignment! The use of movdqa in the Intel SSE2 version
+  // for both loads and stores implies we can safely use aligned
+  // loads and stores here as well.
+
+  vector unsigned int m0, m1, m2, m3, m4, m5, m6;
+  vector unsigned short s0, s1;
+  vector unsigned char c0;
+  ptrdiff_t offs = 0;
+
+  vector unsigned int v16 = vec_splat_u32(8); // Computed momentarily
+  vector unsigned int vone = vec_splat_u32(1);
+  vector unsigned int vzero = vec_splat_u32(0);
+  vector unsigned int vfive = vec_splat_u32(5);
+
+  m0 = vec_ld(0, (uint32_t *)above);
+  m1 = vec_ld(0, (uint32_t *)left);
+
+  // The SSE2 version starts using 32-bit words, as we do.
+  m2 = vec_sum4s((vector unsigned char)m0, vzero);
+  m3 = vec_sum4s((vector unsigned char)m1, vzero);
+  m2 = (vector unsigned int)vec_sum2s((vector signed int)m2, (vector signed int)vzero);
+  m3 = (vector unsigned int)vec_sum2s((vector signed int)m3, (vector signed int)vzero);
+
+  v16 = vec_sl(v16, vone);
+  m4 = vec_adds(m2, m3);
+
+  // Combine 64 bits of m3 with m4 (equivalent to movhlps).
+  m5 = vec_sld(m3, m3, 8);
+  m6 = vec_sld(m4, m5, 8);
+
+  m0 = vec_adds(m4, m6);
+  m1 = vec_adds(m0, v16);
+  m2 = vec_sra(m1, vfive);
+
+  // Pack to 16 bits, splat the short, and pack again to yield 8 bits.
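+  // Element 1 of m2 now holds the 32-bit DC value, i.e.
+  // (sum of 16 above pixels + sum of 16 left pixels + 16) >> 5. Packing to
+  // shorts keeps it at index 1; the splat then broadcasts it before the
+  // final pack down to bytes.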
+  s0 = vec_packsu(m2, m2);
+  s1 = vec_splat(s0, 1);
+  c0 = vec_packsu(s1, s1);
+
+  // 16 stores
+#define NEXT vec_st(c0, offs, dst); offs += y_stride;
+
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+
+  NEXT
+  NEXT
+  NEXT
+  NEXT
+
+#undef NEXT
+}
\ No newline at end of file
diff --git a/media/libvpx/vp9_rtcd_tenfourfox_altivec.h b/media/libvpx/vp9_rtcd_tenfourfox_altivec.h
index 7564e7901..42da1e686 100644
--- a/media/libvpx/vp9_rtcd_tenfourfox_altivec.h
+++ b/media/libvpx/vp9_rtcd_tenfourfox_altivec.h
@@ -166,15 +166,18 @@ void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t
 #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
 
 void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
 
 void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
 
 void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
 
 void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
 
 void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
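
For context, all three routines in the patch compute the same scalar DC prediction: sum the block's above and left edge pixels, round, divide by the number of edge pixels, and fill the block with the result. The following is an illustrative plain-C sketch of that computation, not the libvpx vp9_dc_predictor_*_c source; the function name and the bs parameter are invented for this example.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative scalar equivalent of the DC predictors above (bs = 4, 8 or 16).
 * Dividing by 2*bs matches the (sum + bs) >> 3/4/5 shifts in the VMX code,
 * since 2*bs is a power of two for these block sizes. */
static void dc_predictor_sketch(uint8_t *dst, ptrdiff_t y_stride, int bs,
                                const uint8_t *above, const uint8_t *left) {
  int i, sum = 0;
  for (i = 0; i < bs; i++)
    sum += above[i] + left[i];
  {
    const uint8_t dc = (uint8_t)((sum + bs) / (2 * bs));
    for (i = 0; i < bs; i++) {
      memset(dst, dc, bs);  /* every row of the bs x bs block gets the DC value */
      dst += y_stride;
    }
  }
}

Comparing the VMX routines against a form like this over a few strides is a cheap sanity check before the vp9_dc_predictor_4x4/8x8/16x16 defines in vp9_rtcd_tenfourfox_altivec.h are switched from the _c to the _vmx versions.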