From 06b7ea5a6e55bd01a6dcfb36076255936b32b6ae Mon Sep 17 00:00:00 2001
From: Thomas Harte <thomas.harte@gmail.com>
Date: Tue, 8 Sep 2020 19:15:19 -0400
Subject: [PATCH] Strips the luma kernel back to 1d.

---
 .../Clock Signal/ScanTarget/CSScanTarget.mm   | 54 ++++++++++++-----
 .../Clock Signal/ScanTarget/ScanTarget.metal  | 58 +++++++++----------
 2 files changed, 67 insertions(+), 45 deletions(-)

diff --git a/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm b/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm
index 3e6f858b8..1b320a09d 100644
--- a/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm	
+++ b/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm	
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <atomic>
+#include <cmath>
 
 #include "BufferingScanTarget.hpp"
 #include "FIRFilter.hpp"
@@ -104,7 +105,7 @@ struct Uniforms {
 	float zoom;
 	simd::float2 offset;
 	simd::float3 chromaCoefficients[8];
-	simd::float2 lumaCoefficients[8];
+	float lumaKernel[8];
 	float radiansPerPixel;
 	float cyclesMultiplier;
 	float outputAlpha;
@@ -166,6 +167,30 @@ std::array<float, 8> boxCoefficients(float radiansPerPixel, float cutoff) {
 	return filter;
 }
 
+/// @returns the IEEE 754 binary16 conversion of @c value, stored in a 16-bit int. The mantissa is
+/// truncated rather than rounded; zeroes and values too small for a normalised half become a signed
+/// zero, values too large become a signed infinity, and infinities and NaNs are preserved as such.
+uint16_t half(float value) {
+	const uint16_t sign = std::signbit(value) ? 0x8000 : 0x0000;
+	if(!std::isfinite(value)) return uint16_t(sign | (std::isnan(value) ? 0x7e00 : 0x7c00));
+
+	int exponent = 0;
+	const float mantissa = std::frexp(std::fabs(value), &exponent);
+
+	// There is a bias of 15 on the exponent; given that the value given by frexp doesn't have the
+	// implicit first bit — that'll be masked off below — that's like a bias of 14 versus the output
+	// of frexp.
+	exponent += 14;
+
+	// frexp reports zero as mantissa 0, exponent 0, which would otherwise encode as 0.5; and biased
+	// exponents of 0 and 31 are reserved, so out-of-range values are clamped rather than wrapped.
+	if(mantissa == 0.0f || exponent < 1) return sign;
+	if(exponent > 30) return uint16_t(sign | 0x7c00);
+
+	// Store the exponent plus the top ten bits of the mantissa that follow its implicit leading one.
+	return uint16_t(sign | (exponent << 10) | (uint16_t(mantissa * 2048.0f) & 0x3ff));
+}
+
 }
 
 using BufferingScanTarget = Outputs::Display::BufferingScanTarget;
@@ -258,6 +283,9 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget;
 	BufferingScanTarget::LineMetadata _lineMetadataBuffer[NumBufferedLines];
 	std::atomic_flag _isDrawing;
 
+	// Additional pipeline information.
+	size_t _lumaKernelSize;
+
 	// The output view.
 	__weak MTKView *_view;
 }
@@ -653,7 +681,7 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget;
 		// Generate the chrominance filter.
 		{
 			auto *const firCoefficients = uniforms()->chromaCoefficients;
-			const auto chromaCoefficients = boxCoefficients(uniforms()->radiansPerPixel, 3.141592654f * 0.5f);
+			const auto chromaCoefficients = boxCoefficients(uniforms()->radiansPerPixel, 3.141592654f);
 			for(size_t c = 0; c < 8; ++c) {
 				firCoefficients[c].y = firCoefficients[c].z = (isSVideoOutput ? 2.0f : 1.0f) * chromaCoefficients[c];
 				firCoefficients[c].x = 0.0f;
@@ -667,7 +695,7 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget;
 			//
 			// The low cut off ['Hz' but per line, not per second] is somewhat arbitrary.
 			if(!isSVideoOutput) {
-				SignalProcessing::FIRFilter sharpenFilter(15, float(_lineBufferPixelsPerLine), 40.0f, colourCyclesPerLine);
+				SignalProcessing::FIRFilter sharpenFilter(15, float(_lineBufferPixelsPerLine), 20.0f, colourCyclesPerLine);
 				const auto sharpen = sharpenFilter.get_coefficients();
 				for(size_t c = 0; c < 8; ++c) {
 					firCoefficients[c].x = sharpen[c];
@@ -675,20 +703,16 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget;
 			}
 		}
 
-		// Generate the luminance separation filter.
+		// Generate the luminance separation filter and determine its required size.
 		{
-			auto *const firCoefficients = uniforms()->lumaCoefficients;
-			SignalProcessing::FIRFilter lumaPart(15, float(_lineBufferPixelsPerLine), 0.0f, colourCyclesPerLine * 0.5f);
-//			SignalProcessing::FIRFilter chromaPart(15, float(_lineBufferPixelsPerLine), 0.0f, colourCyclesPerLine * 0.5f);
-
-//			const auto chromaCoefficients = lumaPart.get_coefficients();
-//			const auto lumaCoefficients = lumaPart.get_coefficients();
-			const auto chromaCoefficients = boxCoefficients(uniforms()->radiansPerPixel, 3.141592654f);//chromaPart.get_coefficients();
-			const auto lumaCoefficients = lumaPart.get_coefficients();
-//			const auto chromaCoefficients = lumaCoefficients;
+			auto *const filter = uniforms()->lumaKernel;
+			const auto coefficients = boxCoefficients(uniforms()->radiansPerPixel, 3.141592654f);
+			_lumaKernelSize = 15;
 			for(size_t c = 0; c < 8; ++c) {
-				firCoefficients[c].x = //lumaCoefficients[c];
-				firCoefficients[c].y = chromaCoefficients[c];
+				filter[c] = coefficients[c];
+				if(coefficients[c] < 0.01f) {
+					_lumaKernelSize -= 2;
+				}
 			}
 		}
 	}
diff --git a/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal b/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal
index 98f6df2f0..70e6eb5f2 100644
--- a/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal	
+++ b/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal	
@@ -36,9 +36,9 @@ struct Uniforms {
 	// 15 coefficients but they're symmetrical around the centre.
 	float3 chromaCoefficients[8];
 
-	// Describes the FIR filter in use for luma filtering; also 15 coefficients
+	// Describes the filter in use for luma filtering; 15 coefficients
 	// symmetrical around the centre.
-	float2 lumaCoefficients[8];
+	float lumaKernel[8];
 
 	// Maps from pixel offsets into the composition buffer to angular difference.
 	float radiansPerPixel;
@@ -449,45 +449,43 @@ kernel void filterChromaKernelWithGamma(texture2d<float, access::read> inTexture
 ///	(luminance, 0.5 + 0.5*chrominance*cos(phase), 0.5 + 0.5*chrominance*sin(phase))
 ///
 /// i.e. the input form for the filterChromaKernel, above].
-kernel void separateLumaKernel(	texture2d<float, access::read> inTexture [[texture(0)]],
-								texture2d<float, access::write> outTexture [[texture(1)]],
+kernel void separateLumaKernel(	texture2d<half, access::read> inTexture [[texture(0)]],
+								texture2d<half, access::write> outTexture [[texture(1)]],
 								uint2 gid [[thread_position_in_grid]],
 								constant Uniforms &uniforms [[buffer(0)]],
 								constant int &offset [[buffer(1)]]) {
-	const float4 centreSample = inTexture.read(gid + uint2(7, offset));
-	const float2 rawSamples[] = {
-		inTexture.read(gid + uint2(0, offset)).rr,
-		inTexture.read(gid + uint2(1, offset)).rr,
-		inTexture.read(gid + uint2(2, offset)).rr,
-		inTexture.read(gid + uint2(3, offset)).rr,
-		inTexture.read(gid + uint2(4, offset)).rr,
-		inTexture.read(gid + uint2(5, offset)).rr,
-		inTexture.read(gid + uint2(6, offset)).rr,
-		centreSample.rr,
-		inTexture.read(gid + uint2(8, offset)).rr,
-		inTexture.read(gid + uint2(9, offset)).rr,
-		inTexture.read(gid + uint2(10, offset)).rr,
-		inTexture.read(gid + uint2(11, offset)).rr,
-		inTexture.read(gid + uint2(12, offset)).rr,
-		inTexture.read(gid + uint2(13, offset)).rr,
-		inTexture.read(gid + uint2(14, offset)).rr,
+	const half4 centreSample = inTexture.read(gid + uint2(7, offset));
+	const half rawSamples[] = {
+		inTexture.read(gid + uint2(0, offset)).r,
+		inTexture.read(gid + uint2(1, offset)).r,
+		inTexture.read(gid + uint2(2, offset)).r,
+		inTexture.read(gid + uint2(3, offset)).r,
+		inTexture.read(gid + uint2(4, offset)).r,
+		inTexture.read(gid + uint2(5, offset)).r,
+		inTexture.read(gid + uint2(6, offset)).r,
+		centreSample.r,
+		inTexture.read(gid + uint2(8, offset)).r,
+		inTexture.read(gid + uint2(9, offset)).r,
+		inTexture.read(gid + uint2(10, offset)).r,
+		inTexture.read(gid + uint2(11, offset)).r,
+		inTexture.read(gid + uint2(12, offset)).r,
+		inTexture.read(gid + uint2(13, offset)).r,
+		inTexture.read(gid + uint2(14, offset)).r,
 	};
 
-#define Sample(x, y) uniforms.lumaCoefficients[y] * rawSamples[x]
-	const float2 luminance =
+#define Sample(x, y) half(uniforms.lumaKernel[y]) * rawSamples[x]
+	const half luminance =
 		Sample(0, 0) + Sample(1, 1) + Sample(2, 2) + Sample(3, 3) + Sample(4, 4) + Sample(5, 5) + Sample(6, 6) +
 		Sample(7, 7) +
 		Sample(8, 6) + Sample(9, 5) + Sample(10, 4) + Sample(11, 3) + Sample(12, 2) + Sample(13, 1) + Sample(14, 0);
 #undef Sample
 
 	// The mix/steps below ensures that the absence of a colour burst leads the colour subcarrier to be discarded.
-	const float isColour = step(0.01, centreSample.a);
-	const float chroma = (centreSample.r - luminance.g) / mix(1.0f, centreSample.a, isColour);
-	outTexture.write(float4(
-//			mix(luminance.g, luminance.r / (1.0f - centreSample.a), isColour),
-			luminance.r / mix(1.0f, (1.0f - centreSample.a), isColour),
-//			luminance.r,
-			isColour * (centreSample.gb - float2(0.5f)) * chroma + float2(0.5f),
+	const half isColour = step(half(0.01f), centreSample.a);
+	const half chroma = (centreSample.r - luminance) / mix(half(1.0f), centreSample.a, isColour);
+	outTexture.write(half4(
+			luminance / mix(half(1.0f), (half(1.0f) - centreSample.a), isColour),
+			isColour * (centreSample.gb - half2(0.5f)) * chroma + half2(0.5f),
 			1.0f
 		),
 		gid + uint2(7, offset));