From 9e2bf2af7e90d407abf84e77ecf80ee223af07f1 Mon Sep 17 00:00:00 2001 From: Thomas Harte Date: Tue, 1 Sep 2020 21:27:40 -0400 Subject: [PATCH] Restricts S-Video processing to updated lines. --- .../Clock Signal/ScanTarget/CSScanTarget.mm | 68 +++++++++++++------ .../Clock Signal/ScanTarget/ScanTarget.metal | 36 +++++----- 2 files changed, 66 insertions(+), 38 deletions(-) diff --git a/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm b/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm index 763a464c5..502e4b31c 100644 --- a/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm +++ b/OSBindings/Mac/Clock Signal/ScanTarget/CSScanTarget.mm @@ -207,6 +207,11 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget; id _separatedLumaState; NSUInteger _lineBufferPixelsPerLine; + size_t _lineOffsetBuffer; + id _lineOffsetBuffers[NumBufferedLines]; // Allocating NumBufferedLines buffers ensures these can't possibly be exhausted; + // for this list to be exhausted there'd have to be more draw calls in flight than + // there are lines for them to operate upon. + // The scan target in C++-world terms and the non-GPU storage for it. BufferingScanTarget _scanTarget; BufferingScanTarget::LineMetadata _lineMetadataBuffer[NumBufferedLines]; @@ -266,6 +271,13 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget; depthStencilDescriptor.frontFaceStencil.stencilFailureOperation = MTLStencilOperationReplace; _clearStencilState = [view.device newDepthStencilStateWithDescriptor:depthStencilDescriptor]; + // Allocate a large number of single-int buffers, for supplying offsets to the compute shaders. + // There's a ridiculous amount of overhead in this, but it avoids allocations during drawing, + // and a single int per instance is all I need. + for(size_t c = 0; c < NumBufferedLines; ++c) { + _lineOffsetBuffers[c] = [_view.device newBufferWithLength:sizeof(int) options:SharedResourceOptionsStandard]; + } + // Ensure the is-drawing flag is initially clear. _isDrawing.clear(); } @@ -667,7 +679,12 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget; [encoder endEncoding]; } -- (void)dispatchComputeCommandEncoder:(id)encoder pipelineState:(id)pipelineState width:(NSUInteger)width height:(NSUInteger)height { +- (void)dispatchComputeCommandEncoder:(id)encoder pipelineState:(id)pipelineState width:(NSUInteger)width height:(NSUInteger)height offset:(size_t)offset { + // Store and apply the offset. + *(reinterpret_cast(_lineOffsetBuffers[_lineOffsetBuffer].contents)) = int(offset); + [encoder setBuffer:_lineOffsetBuffers[_lineOffsetBuffer] offset:0 atIndex:1]; + _lineOffsetBuffer = (_lineOffsetBuffer + 1) % NumBufferedLines; + // This follows the recommendations at https://developer.apple.com/documentation/metal/calculating_threadgroup_and_grid_sizes ; // I currently have no independent opinion whatsoever. const MTLSize threadsPerThreadgroup = MTLSizeMake( @@ -677,6 +694,7 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget; ); const MTLSize threadsPerGrid = MTLSizeMake(width, height, 1); + // Set the pipeline state and dispatch the drawing. Which may slightly overdraw. [encoder setComputePipelineState:pipelineState]; [encoder dispatchThreads:threadsPerGrid threadsPerThreadgroup:threadsPerThreadgroup]; } @@ -740,29 +758,37 @@ using BufferingScanTarget = Outputs::Display::BufferingScanTarget; // Build the composition buffer. [self composeOutputArea:outputArea commandBuffer:commandBuffer]; - // Filter to the finalised line texture. - id computeEncoder = [commandBuffer computeCommandEncoder]; - [computeEncoder setTexture:_compositionTexture atIndex:0]; - [computeEncoder setTexture:_finalisedLineTexture atIndex:1]; - [computeEncoder setBuffer:_uniformsBuffer offset:0 atIndex:0]; + if(outputArea.end.line != outputArea.start.line) { + // Filter to the finalised line texture. + id computeEncoder = [commandBuffer computeCommandEncoder]; + [computeEncoder setTexture:_compositionTexture atIndex:0]; + [computeEncoder setTexture:_finalisedLineTexture atIndex:1]; + [computeEncoder setBuffer:_uniformsBuffer offset:0 atIndex:0]; - // TODO: limit processed area to those lines that are actually in use. - [self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:NumBufferedLines]; - - [computeEncoder endEncoding]; - - // Output lines, broken up by frame. - size_t startLine = outputArea.start.line; - size_t line = outputArea.start.line; - while(line != outputArea.end.line) { - if(_lineMetadataBuffer[line].is_first_in_frame && _lineMetadataBuffer[line].previous_frame_was_complete) { - [self outputFrom:startLine to:line commandBuffer:commandBuffer]; - [self outputFrameCleanerToCommandBuffer:commandBuffer]; - startLine = line; + if(outputArea.end.line > outputArea.start.line) { + [self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:outputArea.end.line - outputArea.start.line offset:outputArea.start.line]; + } else { + [self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:NumBufferedLines - outputArea.start.line offset:outputArea.start.line]; + if(outputArea.end.line) { + [self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:outputArea.end.line offset:0]; + } } - line = (line + 1) % NumBufferedLines; + + [computeEncoder endEncoding]; + + // Output lines, broken up by frame. + size_t startLine = outputArea.start.line; + size_t line = outputArea.start.line; + while(line != outputArea.end.line) { + if(_lineMetadataBuffer[line].is_first_in_frame && _lineMetadataBuffer[line].previous_frame_was_complete) { + [self outputFrom:startLine to:line commandBuffer:commandBuffer]; + [self outputFrameCleanerToCommandBuffer:commandBuffer]; + startLine = line; + } + line = (line + 1) % NumBufferedLines; + } + [self outputFrom:startLine to:outputArea.end.line commandBuffer:commandBuffer]; } - [self outputFrom:startLine to:outputArea.end.line commandBuffer:commandBuffer]; } break; } diff --git a/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal b/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal index d2fcb82ed..f5eb88940 100644 --- a/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal +++ b/OSBindings/Mac/Clock Signal/ScanTarget/ScanTarget.metal @@ -7,6 +7,7 @@ // #include + using namespace metal; // TODO: I'm being very loose, so far, in use of alpha. Sometimes it's 0.64, somtimes its 1.0. @@ -402,24 +403,25 @@ fragment float4 interpolateFragment(CopyInterpolator vert [[stage_in]], texture2 kernel void filterChromaKernel( texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]], - constant Uniforms &uniforms [[buffer(0)]]) { + constant Uniforms &uniforms [[buffer(0)]], + constant int &offset [[buffer(1)]]) { constexpr float4 moveToZero = float4(0.0f, 0.5f, 0.5f, 0.0f); const float4 rawSamples[] = { - inTexture.read(gid) - moveToZero, - inTexture.read(gid + uint2(1, 0)) - moveToZero, - inTexture.read(gid + uint2(2, 0)) - moveToZero, - inTexture.read(gid + uint2(3, 0)) - moveToZero, - inTexture.read(gid + uint2(4, 0)) - moveToZero, - inTexture.read(gid + uint2(5, 0)) - moveToZero, - inTexture.read(gid + uint2(6, 0)) - moveToZero, - inTexture.read(gid + uint2(7, 0)) - moveToZero, - inTexture.read(gid + uint2(8, 0)) - moveToZero, - inTexture.read(gid + uint2(9, 0)) - moveToZero, - inTexture.read(gid + uint2(10, 0)) - moveToZero, - inTexture.read(gid + uint2(11, 0)) - moveToZero, - inTexture.read(gid + uint2(12, 0)) - moveToZero, - inTexture.read(gid + uint2(13, 0)) - moveToZero, - inTexture.read(gid + uint2(14, 0)) - moveToZero, + inTexture.read(gid + uint2(0, offset)) - moveToZero, + inTexture.read(gid + uint2(1, offset)) - moveToZero, + inTexture.read(gid + uint2(2, offset)) - moveToZero, + inTexture.read(gid + uint2(3, offset)) - moveToZero, + inTexture.read(gid + uint2(4, offset)) - moveToZero, + inTexture.read(gid + uint2(5, offset)) - moveToZero, + inTexture.read(gid + uint2(6, offset)) - moveToZero, + inTexture.read(gid + uint2(7, offset)) - moveToZero, + inTexture.read(gid + uint2(8, offset)) - moveToZero, + inTexture.read(gid + uint2(9, offset)) - moveToZero, + inTexture.read(gid + uint2(10, offset)) - moveToZero, + inTexture.read(gid + uint2(11, offset)) - moveToZero, + inTexture.read(gid + uint2(12, offset)) - moveToZero, + inTexture.read(gid + uint2(13, offset)) - moveToZero, + inTexture.read(gid + uint2(14, offset)) - moveToZero, }; #define Sample(x, y) uniforms.firCoefficients[y] * rawSamples[x].rgb @@ -429,5 +431,5 @@ kernel void filterChromaKernel( texture2d inTexture [[textu Sample(8, 6) + Sample(9, 5) + Sample(10, 4) + Sample(11, 3) + Sample(12, 2) + Sample(13, 1) + Sample(14, 0); #undef Sample - outTexture.write(float4(uniforms.toRGB * colour, 1.0f), gid + uint2(7, 0)); + outTexture.write(float4(uniforms.toRGB * colour, 1.0f), gid + uint2(7, offset)); }