From c82e0df0715cee6c9f8653c99ccf723db80b188b Mon Sep 17 00:00:00 2001
From: Thomas Harte <thomas.harte@gmail.com>
Date: Tue, 8 Sep 2020 19:37:36 -0400
Subject: [PATCH] Starts a transition towards half-precision arithmetic.

---
 .../Clock Signal/ScanTarget/CSScanTarget.mm   | 28 +-------------
 .../Clock Signal/ScanTarget/ScanTarget.metal  | 38 +++++++++----------
 2 files changed, 21 insertions(+), 45 deletions(-)
diff --git a/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm b/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm
index 1b320a09d..0c74d79da 100644
--- a/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm	
+++ b/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm	
@@ -105,7 +105,7 @@ struct Uniforms {
 	float zoom;
 	simd::float2 offset;
 	simd::float3 chromaCoefficients[8];
-	float lumaKernel[8];
+	__fp16 lumaKernel[8];
 	float radiansPerPixel;
 	float cyclesMultiplier;
 	float outputAlpha;
@@ -167,30 +167,6 @@ std::array<float, 8> boxCoefficients(float radiansPerPixel, float cutoff) {
 	return filter;
 }
 
-/// @returns the IEEE 754 binary16 conversion of @c value, stored in a 16-bit int.
-uint16_t half(float value) {
-	uint16_t result = 0;
-
-	if(value < 0) {
-		result |= 0x8000;
-		value = -value;
-	}
-
-	int exponent;
-	const float mantissa = frexpf(value, &exponent);
-
-	// There is a bias of 15 on the exponent; given that the value given by frexp doesn't have the
-	// implicit first bit — that'll be masked off below — that's like a bias of 14 versus the output
-	// of frexp.
-	exponent += 14;
-	result |= (exponent & 31) << 10;
-
-	// Also store the mantissa.
-	result |= uint16_t(mantissa * 2048.0f) & 0x3ff;
-
-	return result;
-}
-
 }
 
 using BufferingScanTarget = Outputs::Display::BufferingScanTarget;
@@ -709,7 +685,7 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget;
 			const auto coefficients = boxCoefficients(uniforms()->radiansPerPixel, 3.141592654f);
 			_lumaKernelSize = 15;
 			for(size_t c = 0; c < 8; ++c) {
-				filter[c] = coefficients[c];
+				filter[c] = __fp16(coefficients[c]);
 				if(coefficients[c] < 0.01f) {
 					_lumaKernelSize -= 2;
 				}
diff --git a/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal b/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal
index 70e6eb5f2..72622e15c 100644
--- a/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal	
+++ b/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal	
@@ -38,7 +38,7 @@ struct Uniforms {
 
 	// Describes the filter in use for luma filtering; 15 coefficients
 	// symmetrical around the centre.
-	float lumaKernel[8];
+	half lumaKernel[8];
 
 	// Maps from pixel offsets into the composition buffer to angular difference.
 	float radiansPerPixel;
@@ -384,13 +384,13 @@ fragment float4 clearFragment() {
 
 /// Given input pixels of the form (luminance, 0.5 + 0.5*chrominance*cos(phase), 0.5 + 0.5*chrominance*sin(phase)), applies a lowpass
 /// filter to the two chrominance parts, then uses the toRGB matrix to convert to RGB and stores.
-template <bool applyGamma> void filterChromaKernel(	texture2d<float, access::read> inTexture [[texture(0)]],
-													texture2d<float, access::write> outTexture [[texture(1)]],
+template <bool applyGamma> void filterChromaKernel(	texture2d<half, access::read> inTexture [[texture(0)]],
+													texture2d<half, access::write> outTexture [[texture(1)]],
 													uint2 gid [[thread_position_in_grid]],
 													constant Uniforms &uniforms [[buffer(0)]],
 													constant int &offset [[buffer(1)]]) {
-	constexpr float4 moveToZero = float4(0.0f, 0.5f, 0.5f, 0.0f);
-	const float4 rawSamples[] = {
+	constexpr half4 moveToZero(0.0f, 0.5f, 0.5f, 0.0f);
+	const half4 rawSamples[] = {
 		inTexture.read(gid + uint2(0, offset)) - moveToZero,
 		inTexture.read(gid + uint2(1, offset)) - moveToZero,
 		inTexture.read(gid + uint2(2, offset)) - moveToZero,
@@ -408,14 +408,14 @@ template <bool applyGamma> void filterChromaKernel(	texture2d<float, access::rea
 		inTexture.read(gid + uint2(14, offset)) - moveToZero,
 	};
 
-#define Sample(x, y) uniforms.chromaCoefficients[y] * rawSamples[x].rgb
-	const float3 colour =
+#define Sample(x, y) half3(uniforms.chromaCoefficients[y]) * rawSamples[x].rgb
+	const half3 colour =
 		Sample(0, 0) + Sample(1, 1) + Sample(2, 2) + Sample(3, 3) + Sample(4, 4) + Sample(5, 5) + Sample(6, 6) +
 		Sample(7, 7) +
 		Sample(8, 6) + Sample(9, 5) + Sample(10, 4) + Sample(11, 3) + Sample(12, 2) + Sample(13, 1) + Sample(14, 0);
 #undef Sample
 
-	const float4 output = float4(uniforms.toRGB * colour * uniforms.outputMultiplier, uniforms.outputAlpha);
+	const half4 output = half4(half3x3(uniforms.toRGB) * colour * half(uniforms.outputMultiplier), half(uniforms.outputAlpha));
 	if(applyGamma) {
 		outTexture.write(pow(output, uniforms.outputGamma), gid + uint2(7, offset));
 	} else {
@@ -423,19 +423,19 @@ template <bool applyGamma> void filterChromaKernel(	texture2d<float, access::rea
 	}
 }
 
-kernel void filterChromaKernelNoGamma(texture2d<float, access::read> inTexture [[texture(0)]],
-													texture2d<float, access::write> outTexture [[texture(1)]],
-													uint2 gid [[thread_position_in_grid]],
-													constant Uniforms &uniforms [[buffer(0)]],
-													constant int &offset [[buffer(1)]]) {
+kernel void filterChromaKernelNoGamma(	texture2d<half, access::read> inTexture [[texture(0)]],
+										texture2d<half, access::write> outTexture [[texture(1)]],
+										uint2 gid [[thread_position_in_grid]],
+										constant Uniforms &uniforms [[buffer(0)]],
+										constant int &offset [[buffer(1)]]) {
 	filterChromaKernel<false>(inTexture, outTexture, gid, uniforms, offset);
 }
 
-kernel void filterChromaKernelWithGamma(texture2d<float, access::read> inTexture [[texture(0)]],
-													texture2d<float, access::write> outTexture [[texture(1)]],
-													uint2 gid [[thread_position_in_grid]],
-													constant Uniforms &uniforms [[buffer(0)]],
-													constant int &offset [[buffer(1)]]) {
+kernel void filterChromaKernelWithGamma(	texture2d<half, access::read> inTexture [[texture(0)]],
+											texture2d<half, access::write> outTexture [[texture(1)]],
+											uint2 gid [[thread_position_in_grid]],
+											constant Uniforms &uniforms [[buffer(0)]],
+											constant int &offset [[buffer(1)]]) {
 	filterChromaKernel<true>(inTexture, outTexture, gid, uniforms, offset);
 }
 
@@ -473,7 +473,7 @@ kernel void separateLumaKernel(	texture2d<half, access::read> inTexture [[textur
 		inTexture.read(gid + uint2(14, offset)).r,
 	};
 
-#define Sample(x, y) half(uniforms.lumaKernel[y]) * rawSamples[x]
+#define Sample(x, y) uniforms.lumaKernel[y] * rawSamples[x]
 	const half luminance =
 		Sample(0, 0) + Sample(1, 1) + Sample(2, 2) + Sample(3, 3) + Sample(4, 4) + Sample(5, 5) + Sample(6, 6) +
 		Sample(7, 7) +