#324: complete intra predictors for h/v/dc/128

This commit is contained in:
Cameron Kaiser 2017-11-22 17:27:20 -08:00
parent f6c2519506
commit 7bfd387acb
2 changed files with 815 additions and 32 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018 Cameron Kaiser and Contributors to TenFourFox
* Copyright (c) 2017 Cameron Kaiser and Contributors to TenFourFox
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@ -21,10 +21,12 @@
// v = vector, s = *uint32_t, vv = temporary vector (unsigned int)
// Basic notion.
// Basic notion. Guaranteed to work at any offset.
#define _unaligned_load128(v,s) { v=vec_perm(vec_ld(0,s),vec_ld(16,s),vec_lvsl(0,s)); }
// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.)
// Equivalent for _mm_cvtsi32_si128. (Upper 96 bits undefined.) However,
// this may have issues loading at really weird addresses if they're not
// minimally word-aligned.
#define _unaligned_load32(v,s) { v=vec_lde(0,s); v=vec_perm(v,v,vec_lvsl(0,s)); }
// Equivalent for _mm_cvtsi128_si32.
#define _unaligned_store32(v,vv,s) { vv=vec_splat((vector unsigned int)v,0); vec_ste(vv,0,s); }
@ -36,6 +38,8 @@
vv = vec_splat((vector unsigned int)v, 1); vec_ste(vv,4,s);\
}
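For readers less familiar with AltiVec: the macros above emulate unaligned loads and stores, which VMX lacks, by combining aligned vector loads with vec_perm/vec_lvsl and single-element stores. A minimal scalar sketch of what the 32-bit pair is intended to be equivalent to, ignoring the word-alignment caveat noted above (the helper names are illustrative and not part of this patch):
#include <stdint.h>
#include <string.h>
/* Hypothetical scalar equivalents of _unaligned_load32/_unaligned_store32:
   an unaligned-safe 32-bit load and store done via memcpy. */
static inline uint32_t scalar_load32(const uint8_t *s) {
    uint32_t v;
    memcpy(&v, s, sizeof(v));
    return v;
}
static inline void scalar_store32(uint8_t *d, uint32_t v) {
    memcpy(d, &v, sizeof(v));
}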
/* 4x4 */
void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left)
{
@ -71,12 +75,160 @@ void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride,
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
_unaligned_store32(c0, m0, (uint32_t *)dst);
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride));
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride + y_stride));
_unaligned_store32(c0, m0, (uint32_t *)(dst + y_stride + y_stride + y_stride));
// This is faster than a whole bunch of _unaligned_store32s because we
// already splatted the vector, so it's the same at all positions.
vec_ste((vector unsigned int)c0, 0, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride + y_stride, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
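For context, the rule this routine vectorizes is the standard VP9 DC prediction: average the four above and four left pixels with rounding, then fill the block with that value. A minimal scalar sketch, not part of the patch:
#include <stdint.h>
#include <stddef.h>
/* Scalar sketch of 4x4 DC prediction:
   dc = (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3, then fill the block. */
static void dc_4x4_scalar(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left) {
    int sum = 0;
    for (int i = 0; i < 4; i++)
        sum += above[i] + left[i];
    const uint8_t dc = (uint8_t)((sum + 4) >> 3);
    for (int r = 0; r < 4; r++, dst += stride)
        for (int c = 0; c < 4; c++)
            dst[c] = dc;
}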
inline void _common_top_or_left_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
// Similar idea to the standard predictor, but using only one edge
// (the top row or the left column).
vector unsigned int m0, m1, m2, m3;
vector unsigned short s0, s1;
vector unsigned char c0;
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned int vtwo = vec_splat_u32(2);
_unaligned_load32(m1, (uint32_t *)what);
// Interleave zero to clear out the upper bits so that we get a
// proper cross-sum over the full range.
m0 = vec_mergeh(m1, vzero);
m2 = vec_sum4s((vector unsigned char)m0, vzero);
m3 = (vector unsigned int)vec_sum2s((vector signed int)m2, (vector signed int)vzero);
m0 = vec_add(m3, vtwo);
m1 = vec_sra(m0, vtwo);
s0 = vec_packsu(m1, m1);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
vec_ste((vector unsigned int)c0, 0, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride + y_stride, (uint32_t *)dst);
vec_ste((vector unsigned int)c0, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
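The one-sided average computed here is (sum of the four available pixels + 2) >> 2, which is what the vtwo add and shift implement; the larger one-sided variants below follow the same pattern with (sum + 4) >> 3, (sum + 8) >> 4 and (sum + 16) >> 5. A scalar sketch for the 4x4 case:
#include <stdint.h>
/* Scalar sketch of the one-sided (top-only or left-only) 4x4 DC value. */
static uint8_t dc_top_or_left_4(const uint8_t *edge) {
    return (uint8_t)((edge[0] + edge[1] + edge[2] + edge[3] + 2) >> 2);
}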
void vp9_dc_top_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_4x4_vmx(dst, y_stride, above);
}
void vp9_dc_left_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_4x4_vmx(dst, y_stride, left);
}
void vp9_dc_128_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Pretty much just blit 128s.
// Storing from a char vector with vec_ste only writes a single byte,
// so splat to int instead.
vector unsigned int m0 = (vector unsigned int)vec_sl(vec_splat_u8(2), vec_splat_u8(6));
vec_ste(m0, 0, (uint32_t *)dst);
vec_ste(m0, y_stride, (uint32_t *)dst);
vec_ste(m0, y_stride + y_stride, (uint32_t *)dst);
vec_ste(m0, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
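The 128 constant is synthesized as 2 << 6 because vec_splat_u8 only accepts a 5-bit immediate (-16..15). The predictor itself is trivial; a scalar sketch for the 4x4 case:
#include <stdint.h>
#include <stddef.h>
/* Scalar sketch of the 128 predictor: fill the block with the mid-grey value 128. */
static void dc_128_4x4_scalar(uint8_t *dst, ptrdiff_t stride) {
    for (int r = 0; r < 4; r++, dst += stride)
        for (int c = 0; c < 4; c++)
            dst[c] = 128;
}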
void vp9_v_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Pretty much just copy.
vector unsigned int m0, m1;
_unaligned_load32(m1, (uint32_t *)above);
m0 = vec_splat(m1, 0);
vec_ste(m0, 0, (uint32_t *)dst);
vec_ste(m0, y_stride, (uint32_t *)dst);
vec_ste(m0, y_stride + y_stride, (uint32_t *)dst);
vec_ste(m0, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
void vp9_h_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Expand sequence
// aa bb cc dd -- -- -- -- -- ....
// to
// aa aa aa aa bb bb bb bb cc ....
// This can be done with just splats.
vector unsigned char c0;
vector unsigned int m0, m1, m2, m3;
vector unsigned char vzero = vec_splat_u8(0);
_unaligned_load32(c0, (uint32_t *)left);
m0 = (vector unsigned int)vec_splat(c0, 0);
m1 = (vector unsigned int)vec_splat(c0, 1);
vec_ste(m0, 0, (uint32_t *)dst);
m2 = (vector unsigned int)vec_splat(c0, 2);
vec_ste(m1, y_stride, (uint32_t *)dst);
m3 = (vector unsigned int)vec_splat(c0, 3);
vec_ste(m2, y_stride + y_stride, (uint32_t *)dst);
vec_ste(m3, y_stride + y_stride + y_stride, (uint32_t *)dst);
}
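The per-byte splats above implement the H (horizontal) rule: row i of the block is a copy of left[i]. A scalar sketch for the 4x4 case:
#include <stdint.h>
#include <stddef.h>
/* Scalar sketch of the 4x4 H predictor: each row is filled with the matching left pixel. */
static void h_4x4_scalar(uint8_t *dst, ptrdiff_t stride, const uint8_t *left) {
    for (int r = 0; r < 4; r++, dst += stride)
        for (int c = 0; c < 4; c++)
            dst[c] = left[r];
}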
#if(0)
// This doesn't work properly, and the large amount of unaligned
// memory access in the True Motion predictors makes them a poor
// fit for AltiVec.
void vp9_tm_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Get the top-left pixel (above[-1]) and splat it ("tl").
// Using that preceding byte, compute t[i] - tl + l[i] for the
// appropriate block size, where t = top and l = left, against our
// splatted tl vector.
vector unsigned char c0, c1, c2, c3, c4;
vector unsigned short s0, s1, s2, s3, s4, s5, s6, tl;
vector unsigned int m0;
vector unsigned char vzero = vec_splat_u8(0);
ptrdiff_t offs = 0;
// This can load at really weird addresses, so our
// faster unaligned load32 macro is not sufficient.
_unaligned_load128(c0, (uint32_t *)(above - 1));
c1 = vec_splat(c0, 0);
tl = vec_mergeh(vzero, c1);
// Expand t to short and subtract tl.
_unaligned_load128(c0, (uint32_t *)above);
s1 = vec_mergeh(vzero, c0);
s0 = vec_sub(s1, tl);
#define TM_2X2(x) \
_unaligned_load128(c2, (uint32_t *)(left + 4 - x)); \
_unaligned_load128(c3, (uint32_t *)(left + 5 - x)); \
s2 = vec_mergeh(vzero, c2); \
s3 = vec_mergeh(vzero, c3); \
s4 = vec_splat(s2, 0); \
s5 = vec_splat(s3, 0); \
s2 = vec_add(s0, s4); \
s3 = vec_add(s0, s5); \
c2 = vec_packsu(s2, s2); \
c3 = vec_packsu(s3, s3); \
c4 = vec_perm(c2, c2, vec_lvsr(0, (uint32_t *)(dst + offs))); \
vec_ste(c4, 0, (dst + offs)); \
vec_ste(c4, 1, (dst + offs)); \
vec_ste(c4, 2, (dst + offs)); \
vec_ste(c4, 3, (dst + offs)); \
c4 = vec_perm(c3, c3, vec_lvsr(0, (uint32_t *)(dst + offs + y_stride))); \
vec_ste(c4, 0, (dst + offs + y_stride)); \
vec_ste(c4, 1, (dst + offs + y_stride)); \
vec_ste(c4, 2, (dst + offs + y_stride)); \
vec_ste(c4, 3, (dst + offs + y_stride)); \
offs += y_stride + y_stride;
TM_2X2(4)
TM_2X2(2)
#undef TM_2X2
}
#endif
/* 8x8 */
void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left)
{
@ -109,7 +261,9 @@ void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
#define NEXT _unaligned_store64(c0, m0, (uint32_t *)(dst+offs)); offs+=y_stride;
// Again, faster than repeated _unaligned_store64s since we
// already splatted.
#define NEXT vec_ste((vector unsigned int)c0, offs, (uint32_t *)dst); vec_ste((vector unsigned int)c0, 4+offs, (uint32_t *)dst); offs+=y_stride;
NEXT
NEXT
@ -123,6 +277,133 @@ void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride,
#undef NEXT
}
inline void _common_top_or_left_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
// Again, the single-edge variant.
vector unsigned int m0, m1, m2, m3, m4;
vector unsigned short s0, s1;
vector unsigned char c0;
ptrdiff_t offs = 0;
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned int vfour = vec_splat_u32(4);
vector unsigned int vthree = vec_splat_u32(3);
_unaligned_load64(m0, (uint32_t *)what);
// Since all the functions from here on load at least 64 bits, we no longer
// need to interleave zero to clear out the other half before the SAD.
m1 = vec_sum4s((vector unsigned char)m0, vzero);
m2 = (vector unsigned int)vec_sum2s((vector signed int)m1, (vector signed int)vzero);
m3 = vec_adds(m2, vfour);
m4 = vec_sra(m3, vthree);
s0 = vec_packsu(m4, m4);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
#define NEXT vec_ste((vector unsigned int)c0, offs, (uint32_t *)dst); vec_ste((vector unsigned int)c0, 4+offs, (uint32_t *)dst); offs+=y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_dc_top_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_8x8_vmx(dst, y_stride, above);
}
void vp9_dc_left_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_8x8_vmx(dst, y_stride, left);
}
void vp9_dc_128_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Yup, blitting 128s again.
vector unsigned int m0 = (vector unsigned int)vec_sl(vec_splat_u8(2), vec_splat_u8(6));
ptrdiff_t offs = 0;
#define NEXT vec_ste(m0, offs, (uint32_t *)dst); vec_ste(m0, 4+offs, (uint32_t *)dst); offs+=y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_v_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Yup, copying again.
vector unsigned int m0, m1;
ptrdiff_t offs = 0;
_unaligned_load64(m1, (uint32_t *)above);
m0 = vec_splat(m1, 0);
m1 = vec_splat(m1, 1);
#define NEXT vec_ste(m0, offs, (uint32_t *)dst); vec_ste(m1, 4+offs, (uint32_t *)dst); offs+=y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_h_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Expand sequence
// aa bb cc dd ee ff gg hh -- -- -- -- -- ....
// to
// aa aa aa aa bb bb bb bb cc cc cc cc dd ....
vector unsigned char c0;
vector unsigned int m0, m1, m2;
ptrdiff_t offs = 0;
_unaligned_load64(c0, (uint32_t *)left);
#define STORE(x) vec_ste(x, offs, (uint32_t *)dst); vec_ste(x, 4+offs, (uint32_t *)dst); offs+=y_stride;
m0 = (vector unsigned int)vec_splat(c0, 0);
m1 = (vector unsigned int)vec_splat(c0, 1);
STORE(m0)
m2 = (vector unsigned int)vec_splat(c0, 2);
STORE(m1)
m0 = (vector unsigned int)vec_splat(c0, 3);
STORE(m2)
m1 = (vector unsigned int)vec_splat(c0, 4);
STORE(m0)
m2 = (vector unsigned int)vec_splat(c0, 5);
STORE(m1)
m0 = (vector unsigned int)vec_splat(c0, 6);
STORE(m2)
m1 = (vector unsigned int)vec_splat(c0, 7);
STORE(m0)
STORE(m1)
#undef STORE
}
/* 16x16 */
void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left)
{
@ -189,4 +470,485 @@ void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride,
NEXT
#undef NEXT
}
void _common_top_or_left_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
vector unsigned int m0, m1, m2, m3;
vector unsigned short s0, s1;
vector unsigned char c0;
ptrdiff_t offs = 0;
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned int vfour = vec_splat_u32(4);
vector unsigned int veight = vec_splat_u32(8);
m0 = vec_ld(0, (uint32_t *)what);
// The Intel version is identical to the full 16x16 except for
// zeroing out the additional vector; otherwise it does all the
// same computations. This is clearly wasteful, so I've elided
// them here. In particular, an SAD of zero against zero will
// always be zero, so we can just drop one of the SADs right now.
m1 = vec_sum4s((vector unsigned char)m0, vzero);
m2 = (vector unsigned int)vec_sum2s((vector signed int)m1, (vector signed int)vzero);
// Also, we don't need the full movhlps steps because the other vector
// will always be zero, so only one vector shift is required.
m3 = vec_sld(m2, vzero, 8);
m0 = vec_adds(m3, m2);
m1 = vec_adds(m0, veight);
m2 = vec_sra(m1, vfour);
s0 = vec_packsu(m2, m2);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
// 16 stores
#define NEXT vec_st(c0, offs, dst); offs += y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
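To make the elided reduction explicit: vec_sum4s leaves four per-word partial sums, vec_sum2s collapses those into the odd-numbered words, and a single 8-byte vec_sld plus add then yields the grand total, so no second (all-zero) SAD or movhlps-style shuffle pair is needed. A scalar sketch of that reduction over 16 bytes:
#include <stdint.h>
/* Scalar view of the horizontal reduction: four 4-byte partial sums (vec_sum4s),
   two pair sums (vec_sum2s), then one final add (vec_sld + vec_adds). */
static int reduce_16_bytes(const uint8_t *p) {
    int partial[4] = { 0, 0, 0, 0 };
    for (int i = 0; i < 16; i++)
        partial[i >> 2] += p[i];
    const int pair0 = partial[0] + partial[1];
    const int pair1 = partial[2] + partial[3];
    return pair0 + pair1;
}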
void vp9_dc_top_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_16x16_vmx(dst, y_stride, above);
}
void vp9_dc_left_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_16x16_vmx(dst, y_stride, left);
}
void vp9_dc_128_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Mmmm, 128 splat splat splat.
vector unsigned char c0 = vec_sl(vec_splat_u8(2), vec_splat_u8(6));
ptrdiff_t offs = 0;
// 16 stores
#define NEXT vec_st(c0, offs, dst); offs += y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_v_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Mmmm, vector copy copy copy.
vector unsigned char c0 = vec_ld(0, above);
ptrdiff_t offs = 0;
// 16 stores
#define NEXT vec_st(c0, offs, dst); offs += y_stride;
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
NEXT
#undef NEXT
}
void vp9_h_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Expand an entire 16-byte vector to 16 splatted vectors.
// Unfortunately, the load is not aligned, but the stores are.
vector unsigned char c0, c1, c2, c3;
ptrdiff_t offs = 0;
_unaligned_load128(c0, left);
// 16 stores
#define SPLAT(n,x) x = vec_splat(c0, n);
#define STORE(x) vec_st(x, offs, dst); offs += y_stride;
SPLAT(0,c1)
SPLAT(1,c2)
STORE(c1)
SPLAT(2,c3)
STORE(c2)
SPLAT(3,c1)
STORE(c3)
SPLAT(4,c2)
STORE(c1)
SPLAT(5,c3)
STORE(c2)
SPLAT(6,c1)
STORE(c3)
SPLAT(7,c2)
STORE(c1)
SPLAT(8,c3)
STORE(c2)
SPLAT(9,c1)
STORE(c3)
SPLAT(10,c2)
STORE(c1)
SPLAT(11,c3)
STORE(c2)
SPLAT(12,c1)
STORE(c3)
SPLAT(13,c2)
STORE(c1)
SPLAT(14,c3)
STORE(c2)
SPLAT(15,c1)
STORE(c3)
STORE(c1)
#undef STORE
#undef SPLAT
}
/* 32x32 */
void vp9_dc_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride,
const uint8_t *above, const uint8_t *left)
{
// Also aligned.
// Approximately the same routine, but double-pumped.
vector unsigned int m0, m1, m2, m3, m4, m5, m6, m7, m8;
vector unsigned short s0, s1;
vector unsigned char c0;
ptrdiff_t offs = 0;
vector unsigned int v32 = vec_splat_u32(8); // Computed momentarily
vector unsigned int vtwo = vec_splat_u32(2);
vector unsigned int vsix = vec_splat_u32(6);
vector unsigned int vzero = vec_splat_u32(0);
m1 = vec_ld(0, (uint32_t *)above);
m2 = vec_ld(16, (uint32_t *)above);
m3 = vec_ld(0, (uint32_t *)left);
m4 = vec_ld(16, (uint32_t *)left);
v32 = vec_sl(v32, vtwo);
m5 = vec_sum4s((vector unsigned char)m1, vzero);
m6 = vec_sum4s((vector unsigned char)m2, vzero);
m7 = vec_sum4s((vector unsigned char)m3, vzero);
m8 = vec_sum4s((vector unsigned char)m4, vzero);
m1 = (vector unsigned int)vec_sum2s((vector signed int)m5, (vector signed int)vzero);
m2 = (vector unsigned int)vec_sum2s((vector signed int)m6, (vector signed int)vzero);
m3 = (vector unsigned int)vec_sum2s((vector signed int)m7, (vector signed int)vzero);
m4 = (vector unsigned int)vec_sum2s((vector signed int)m8, (vector signed int)vzero);
m5 = vec_adds(m1, m2);
m6 = vec_adds(m3, m4);
m0 = vec_adds(m5, m6);
m1 = vec_sld(m2, m2, 8);
m3 = vec_sld(m0, m1, 8);
m4 = vec_adds(m3, m0);
m1 = vec_adds(m4, v32);
m2 = vec_sra(m1, vsix);
s0 = vec_packsu(m2, m2);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
// 32 stores
#define NEXT vec_st(c0, offs, dst); vec_st(c0, offs+16, dst); offs += y_stride;
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
#undef NEXT
}
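As with the smaller sizes, the arithmetic here is (sum of the 64 neighbouring pixels + 32) >> 6, and because vec_splat_u32 is also limited to a -16..15 immediate, the constants are built up: 32 as 8 << 2 here, and 16 as 8 + 8 in the one-sided variant that follows. A scalar sketch of the DC value:
#include <stdint.h>
/* Scalar sketch of the 32x32 DC value: 32 above + 32 left pixels, averaged with rounding. */
static uint8_t dc_32x32_scalar(const uint8_t *above, const uint8_t *left) {
    int sum = 0;
    for (int i = 0; i < 32; i++)
        sum += above[i] + left[i];
    return (uint8_t)((sum + 32) >> 6);
}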
void _common_top_or_left_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *what)
{
// This actually has slightly different logic.
vector unsigned int m0, m1, m2, m3, m4, m5, m6;
vector unsigned short s0, s1;
vector unsigned char c0;
ptrdiff_t offs = 0;
vector unsigned int v16 = vec_splat_u32(8); // Computed momentarily
vector unsigned int vfive = vec_splat_u32(5);
vector unsigned int vzero = vec_splat_u32(0);
m0 = vec_ld(0, (uint32_t *)what);
m2 = vec_ld(16, (uint32_t *)what);
v16 = vec_add(v16, v16);
m5 = vec_sum4s((vector unsigned char)m0, vzero);
m6 = vec_sum4s((vector unsigned char)m2, vzero);
m4 = (vector unsigned int)vec_sum2s((vector signed int)m5, (vector signed int)vzero);
m2 = (vector unsigned int)vec_sum2s((vector signed int)m6, (vector signed int)vzero);
m0 = vec_adds(m4, m2);
m1 = vec_sld(m2, m2, 8);
m3 = vec_sld(m0, m1, 8);
m4 = vec_adds(m3, m0);
m1 = vec_adds(m4, v16);
m2 = vec_sra(m1, vfive);
s0 = vec_packsu(m2, m2);
s1 = vec_splat(s0, 1);
c0 = vec_packsu(s1, s1);
// 32 stores
#define NEXT vec_st(c0, offs, dst); vec_st(c0, offs+16, dst); offs += y_stride;
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
#undef NEXT
}
void vp9_dc_top_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_32x32_vmx(dst, y_stride, above);
}
void vp9_dc_left_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
_common_top_or_left_32x32_vmx(dst, y_stride, left);
}
void vp9_dc_128_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Oh baby, I love to feel those creamy 128s ru... um, sorry.
// What were we doing again?
vector unsigned char c0 = vec_sl(vec_splat_u8(2), vec_splat_u8(6));
ptrdiff_t offs = 0;
// 32 stores
#define NEXT vec_st(c0, offs, dst); vec_st(c0, offs+16, dst); offs += y_stride;
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
#undef NEXT
}
void vp9_v_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Is it hot in here or is it just me?
// Oh, right, copying even more data.
vector unsigned char c0 = vec_ld(0, above);
vector unsigned char c1 = vec_ld(16, above);
ptrdiff_t offs = 0;
// 32 stores
#define NEXT vec_st(c0, offs, dst); vec_st(c1, offs+16, dst); offs += y_stride;
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
NEXT NEXT
#undef NEXT
}
void vp9_h_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left)
{
// Two 16-byte vectors to 32 splatted vectors.
// Again, the load doesn't seem to be aligned :(
// Simplest to run this as a double-pumped 16x16 routine.
vector unsigned char c0, c1, c2, c3;
ptrdiff_t offs = 0;
_unaligned_load128(c0, left);
// 32 stores
#define SPLAT(n,x) x = vec_splat(c0, n);
#define STORE(x) vec_st(x, offs, dst); vec_st(x, 16+offs, dst); offs += y_stride;
SPLAT(0,c1)
SPLAT(1,c2)
STORE(c1)
SPLAT(2,c3)
STORE(c2)
SPLAT(3,c1)
STORE(c3)
SPLAT(4,c2)
STORE(c1)
SPLAT(5,c3)
STORE(c2)
SPLAT(6,c1)
STORE(c3)
SPLAT(7,c2)
STORE(c1)
SPLAT(8,c3)
STORE(c2)
SPLAT(9,c1)
STORE(c3)
SPLAT(10,c2)
STORE(c1)
SPLAT(11,c3)
STORE(c2)
SPLAT(12,c1)
STORE(c3)
SPLAT(13,c2)
STORE(c1)
SPLAT(14,c3)
STORE(c2)
SPLAT(15,c1)
STORE(c3)
_unaligned_load128(c0, left + 16);
STORE(c1)
// 32 more stores
SPLAT(0,c1)
SPLAT(1,c2)
STORE(c1)
SPLAT(2,c3)
STORE(c2)
SPLAT(3,c1)
STORE(c3)
SPLAT(4,c2)
STORE(c1)
SPLAT(5,c3)
STORE(c2)
SPLAT(6,c1)
STORE(c3)
SPLAT(7,c2)
STORE(c1)
SPLAT(8,c3)
STORE(c2)
SPLAT(9,c1)
STORE(c3)
SPLAT(10,c2)
STORE(c1)
SPLAT(11,c3)
STORE(c2)
SPLAT(12,c1)
STORE(c3)
SPLAT(13,c2)
STORE(c1)
SPLAT(14,c3)
STORE(c2)
SPLAT(15,c1)
STORE(c3)
STORE(c1)
#undef STORE
#undef SPLAT
}

View File

@ -142,55 +142,68 @@ void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
void vp9_dc_128_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_vmx
void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
void vp9_dc_128_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_vmx
void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
void vp9_dc_128_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_vmx
void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
void vp9_dc_128_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_vmx
void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
void vp9_dc_left_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_vmx
void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
void vp9_dc_left_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_vmx
void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
void vp9_dc_left_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_vmx
void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
void vp9_dc_left_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_vmx
void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_dc_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_vmx
void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
void vp9_dc_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_vmx
void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_dc_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_vmx
void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_dc_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_vmx
void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
void vp9_dc_top_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_vmx
void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
void vp9_dc_top_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_vmx
void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
void vp9_dc_top_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_vmx
void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
void vp9_dc_top_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_vmx
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@ -244,16 +257,20 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
void vp9_h_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_vmx
void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
void vp9_h_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_vmx
void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
void vp9_h_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_vmx
void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
void vp9_h_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_vmx
void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, int16_t *coeff);
#define vp9_hadamard_16x16 vp9_hadamard_16x16_c
@ -477,16 +494,20 @@ void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abo
#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
void vp9_v_predictor_16x16_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_vmx
void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
void vp9_v_predictor_32x32_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_vmx
void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
void vp9_v_predictor_4x4_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_vmx
void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
void vp9_v_predictor_8x8_vmx(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_vmx
int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
#define vp9_vector_var vp9_vector_var_c