From c82e0df0715cee6c9f8653c99ccf723db80b188b Mon Sep 17 00:00:00 2001 From: Thomas Harte Date: Tue, 8 Sep 2020 19:37:36 -0400 Subject: [PATCH] Starts a transition towards half-precision arithmetic. --- .../Clock Signal/ScanTarget/CSScanTarget.mm | 28 +------------- .../Clock Signal/ScanTarget/ScanTarget.metal | 38 +++++++++---------- 2 files changed, 21 insertions(+), 45 deletions(-) diff --git a/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm b/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm index 1b320a09d..0c74d79da 100644 --- a/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm +++ b/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm @@ -105,7 +105,7 @@ struct Uniforms { float zoom; simd::float2 offset; simd::float3 chromaCoefficients[8]; - float lumaKernel[8]; + __fp16 lumaKernel[8]; float radiansPerPixel; float cyclesMultiplier; float outputAlpha; @@ -167,30 +167,6 @@ std::array boxCoefficients(float radiansPerPixel, float cutoff) { return filter; } -/// @returns the IEEE 754 binary16 conversion of @c value, stored in a 16-bit int. -uint16_t half(float value) { - uint16_t result = 0; - - if(value < 0) { - result |= 0x8000; - value = -value; - } - - int exponent; - const float mantissa = frexpf(value, &exponent); - - // There is a bias of 15 on the exponent; given that the value given by frexp doesn't have the - // implicit first bit — that'll be masked off below — that's like a bias of 14 versus the output - // of frexp. - exponent += 14; - result |= (exponent & 31) << 10; - - // Also store the mantissa. - result |= uint16_t(mantissa * 2048.0f) & 0x3ff; - - return result; -} - } using BufferingScanTarget = Outputs::Display::BufferingScanTarget; @@ -709,7 +685,7 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget; const auto coefficients = boxCoefficients(uniforms()->radiansPerPixel, 3.141592654f); _lumaKernelSize = 15; for(size_t c = 0; c < 8; ++c) { - filter[c] = coefficients[c]; + filter[c] = __fp16(coefficients[c]); if(coefficients[c] < 0.01f) { _lumaKernelSize -= 2; } diff --git a/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal b/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal index 70e6eb5f2..72622e15c 100644 --- a/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal +++ b/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal @@ -38,7 +38,7 @@ struct Uniforms { // Describes the filter in use for luma filtering; 15 coefficients // symmetrical around the centre. - float lumaKernel[8]; + half lumaKernel[8]; // Maps from pixel offsets into the composition buffer to angular difference. float radiansPerPixel; @@ -384,13 +384,13 @@ fragment float4 clearFragment() { /// Given input pixels of the form (luminance, 0.5 + 0.5*chrominance*cos(phase), 0.5 + 0.5*chrominance*sin(phase)), applies a lowpass /// filter to the two chrominance parts, then uses the toRGB matrix to convert to RGB and stores. -template void filterChromaKernel( texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], +template void filterChromaKernel( texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]], constant Uniforms &uniforms [[buffer(0)]], constant int &offset [[buffer(1)]]) { - constexpr float4 moveToZero = float4(0.0f, 0.5f, 0.5f, 0.0f); - const float4 rawSamples[] = { + constexpr half4 moveToZero(0.0f, 0.5f, 0.5f, 0.0f); + const half4 rawSamples[] = { inTexture.read(gid + uint2(0, offset)) - moveToZero, inTexture.read(gid + uint2(1, offset)) - moveToZero, inTexture.read(gid + uint2(2, offset)) - moveToZero, @@ -408,14 +408,14 @@ template void filterChromaKernel( texture2d void filterChromaKernel( texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]], - constant Uniforms &uniforms [[buffer(0)]], - constant int &offset [[buffer(1)]]) { +kernel void filterChromaKernelNoGamma( texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]], + constant Uniforms &uniforms [[buffer(0)]], + constant int &offset [[buffer(1)]]) { filterChromaKernel(inTexture, outTexture, gid, uniforms, offset); } -kernel void filterChromaKernelWithGamma(texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]], - constant Uniforms &uniforms [[buffer(0)]], - constant int &offset [[buffer(1)]]) { +kernel void filterChromaKernelWithGamma( texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]], + constant Uniforms &uniforms [[buffer(0)]], + constant int &offset [[buffer(1)]]) { filterChromaKernel(inTexture, outTexture, gid, uniforms, offset); } @@ -473,7 +473,7 @@ kernel void separateLumaKernel( texture2d inTexture [[textur inTexture.read(gid + uint2(14, offset)).r, }; -#define Sample(x, y) half(uniforms.lumaKernel[y]) * rawSamples[x] +#define Sample(x, y) uniforms.lumaKernel[y] * rawSamples[x] const half luminance = Sample(0, 0) + Sample(1, 1) + Sample(2, 2) + Sample(3, 3) + Sample(4, 4) + Sample(5, 5) + Sample(6, 6) + Sample(7, 7) +