mirror of
https://github.com/TomHarte/CLK.git
synced 2024-11-26 23:52:26 +00:00
1207 lines
53 KiB
Plaintext
1207 lines
53 KiB
Plaintext
//
|
|
// ScanTarget.m
|
|
// Clock Signal
|
|
//
|
|
// Created by Thomas Harte on 02/08/2020.
|
|
// Copyright © 2020 Thomas Harte. All rights reserved.
|
|
//
|
|
|
|
#import "CSScanTarget.h"
|
|
|
|
#import <Metal/Metal.h>
|
|
|
|
#include <algorithm>
|
|
#include <atomic>
|
|
#include <cmath>
|
|
|
|
#include "BufferingScanTarget.hpp"
|
|
#include "FIRFilter.hpp"
|
|
|
|
/*
|
|
|
|
RGB and composite monochrome
|
|
----------------------------
|
|
|
|
Source data is converted to 32bpp RGB or to composite directly from its input, at output resolution.
|
|
Gamma correction is applied unless the inputs are 1bpp (e.g. Macintosh-style black/white, TTL-style RGB).
|
|
|
|
S-Video
|
|
-------
|
|
|
|
Source data is pasted together with a common clock in the composition buffer. Colour phase is baked in
|
|
at this point. Format within the composition buffer is:
|
|
|
|
.r = luminance
|
|
.g = 0.5 + 0.5 * chrominance * cos(phase)
|
|
.b = 0.5 + 0.5 * chrominance * sin(phase)
|
|
|
|
Contents of the composition buffer are then drawn into the finalised line texture; at this point a suitable
|
|
low-pass filter is applied to the two chrominance channels, colours are converted to RGB and gamma corrected.
|
|
|
|
Contents from the finalised line texture are then painted to the display.
|
|
|
|
Composite colour
|
|
----------------
|
|
|
|
Source data is pasted together with a common clock in the composition buffer. Colour phase and amplitude are
|
|
recorded at this point. Format within the composition buffer is:
|
|
|
|
.r = composite value
|
|
.g = 0.5 + 0.5 * cos(phase)
|
|
.b = 0.5 + 0.5 * sin(phase)
|
|
.a = amplitude
|
|
|
|
[aside: upfront calculation of cos/sin is just because it'll need to be calculated at this precision anyway,
|
|
and doing it here avoids having to do unit<->radian conversions on phase alone]
|
|
|
|
Contents of the composition buffer are transferred to the separated-luma buffer, subject to a low-pass filter
|
|
that has sought to separate luminance and chrominance, and with phase and amplitude now baked into the latter:
|
|
|
|
.r = luminance
|
|
.g = 0.5 + 0.5 * chrominance * cos(phase)
|
|
.b = 0.5 + 0.5 * chrominance * sin(phase)
|
|
|
|
The process now continues as per the corresponding S-Video steps.
|
|
|
|
NOTES
|
|
-----
|
|
|
|
1) for many of the input pixel formats it would be possible to do the trigonometric side of things at
|
|
arbitrary precision. Since it would always be necessary to support fixed-precision processing because
|
|
of the directly-sampled input formats, I've used fixed throughout to reduce the number of permutations
|
|
and combinations of code I need to support. The precision is always selected to be at least four times
|
|
the colour clock.
|
|
|
|
2) I experimented with skipping the separated-luma buffer for composite colour based on the observation that
|
|
just multiplying the raw signal by sin and cos and then filtering well below the colour subcarrier frequency
|
|
should be sufficient. It wasn't in practice because the bits of luminance that don't quite separate are then
|
|
of such massive amplitude that you get huge bands of bright colour in place of the usual chroma dots.
|
|
|
|
3) I also initially didn't want to have a finalised-line texture, but processing costs changed my mind on that.
|
|
If you accept that output will be fixed precision, anyway. In that case, processing for a typical NTSC frame
|
|
in its original resolution means applying filtering (i.e. at least 15 samples per pixel) likely between
|
|
218,400 and 273,000 times per output frame, then upscaling from there at 1 sample per pixel. Count the second
|
|
sample twice for the original store and you're talking between 16*218,400 = 3,494,400 to 16*273,000 = 4,368,000
|
|
total pixel accesses. Though that's not a perfect way to measure cost, roll with it.
|
|
|
|
On my 4k monitor, doing it at actual output resolution would instead cost 3840*2160*15 = 124,416,000 total
|
|
accesses. Which doesn't necessarily mean "more than 28 times as much", but does mean "a lot more".
|
|
|
|
(going direct-to-display for composite monochrome means evaluating sin/cos a lot more often than it might
|
|
with more buffering in between, but that doesn't provisionally seem to be as much of a bottleneck)
|
|
*/
|
|
|
|
namespace {
|
|
|
|
/// Provides a container for __fp16 versions of tightly-packed single-precision plain old data with a copy assignment constructor.
|
|
template <typename NaturalType> struct HalfConverter {
|
|
__fp16 elements[sizeof(NaturalType) / sizeof(float)];
|
|
|
|
void operator =(const NaturalType &rhs) {
|
|
const float *floatRHS = reinterpret_cast<const float *>(&rhs);
|
|
for(size_t c = 0; c < sizeof(elements) / sizeof(*elements); ++c) {
|
|
elements[c] = __fp16(floatRHS[c]);
|
|
}
|
|
}
|
|
};
|
|
|
|
// Tracks the Uniforms struct declared in ScanTarget.metal; see there for field definitions.
|
|
//
|
|
// __fp16 is a Clang-specific type which I'm using as equivalent to a Metal half, i.e. an IEEE 754 binary16.
|
|
struct Uniforms {
|
|
int32_t scale[2];
|
|
float cyclesMultiplier;
|
|
float lineWidth;
|
|
|
|
simd::float3x3 sourcetoDisplay;
|
|
|
|
HalfConverter<simd::float3x3> toRGB;
|
|
HalfConverter<simd::float3x3> fromRGB;
|
|
|
|
HalfConverter<simd::float3> chromaKernel[8];
|
|
__fp16 lumaKernel[8];
|
|
|
|
__fp16 outputAlpha;
|
|
__fp16 outputGamma;
|
|
__fp16 outputMultiplier;
|
|
};
|
|
|
|
constexpr size_t NumBufferedLines = 500;
|
|
constexpr size_t NumBufferedScans = NumBufferedLines * 4;
|
|
|
|
/// The shared resource options this app would most favour; applied as widely as possible.
|
|
constexpr MTLResourceOptions SharedResourceOptionsStandard = MTLResourceCPUCacheModeWriteCombined | MTLResourceStorageModeShared;
|
|
|
|
/// The shared resource options used for the write-area texture; on macOS it can't be MTLResourceStorageModeShared so this is a carve-out.
|
|
constexpr MTLResourceOptions SharedResourceOptionsTexture = MTLResourceCPUCacheModeWriteCombined | MTLResourceStorageModeManaged;
|
|
|
|
/// Provides typed access to the contents of the uniforms buffer.
#define uniforms() static_cast<Uniforms *>(_uniformsBuffer.contents)

/// Applies @c func(offset, length) to the region [start, end) of a circular buffer of @c size elements.
/// Nothing happens if start == end; a region that wraps past the end of the buffer is delivered as
/// [start, size) followed, if non-empty, by [0, end).
///
/// Arguments are evaluated more than once, so they should be free of side effects.
#define RangePerform(start, end, size, func)	\
	if((start) < (end)) {	\
		func((start), (end) - (start));	\
	} else if((start) != (end)) {	\
		func((start), (size) - (start));	\
		if(end) {	\
			func(0, (end));	\
		}	\
	}
|
|
|
|
/// Computes a normalised box-filter kernel for a pixel density of @c radiansPerPixel with an angular
/// limit of @c cutoff.
///
/// @returns The first eight coefficients of a fifteen-point filter that is symmetrical around its centre;
/// index 7 is the centre tap. The implied fifteen coefficients sum to one.
std::array<float, 8> boxCoefficients(float radiansPerPixel, float cutoff) {
	// Tap `index` covers the angular window [6.5 - index, 7.5 - index] * radiansPerPixel; its weight is
	// the proportion of that window that lies below the cutoff.
	const auto weight = [radiansPerPixel, cutoff](size_t index) -> float {
		const float windowStart = (6.5f - float(index)) * radiansPerPixel;
		const float windowEnd = (7.5f - float(index)) * radiansPerPixel;

		if(windowEnd < cutoff) return 1.0f;		// Entirely inside the box.
		if(windowStart >= cutoff) return 0.0f;	// Entirely outside the box.
		return (cutoff - windowStart) / radiansPerPixel;
	};

	std::array<float, 8> taps;
	float sum = 0.0f;
	for(size_t index = 0; index < taps.size(); ++index) {
		taps[index] = weight(index);
		sum += 2.0f * taps[index];	// Provisionally count every tap twice, as all but the centre are mirrored.
	}
	sum -= taps.back();				// The centre tap appears only once in the full kernel.

	for(float &tap : taps) {
		tap /= sum;
	}
	return taps;
}
|
|
|
|
}
|
|
|
|
using BufferingScanTarget = Outputs::Display::BufferingScanTarget;
|
|
|
|
@implementation CSScanTarget {
	// The command queue for the device in use.
	id<MTLCommandQueue> _commandQueue;

	// Render pipelines.
	id<MTLRenderPipelineState> _composePipeline;		// Renders to the composition texture.
	id<MTLRenderPipelineState> _outputPipeline;			// Draws to the frame buffer.
	id<MTLRenderPipelineState> _copyPipeline;			// Copies one texture to another.
	id<MTLRenderPipelineState> _supersamplePipeline;	// Resamples from one texture to one that is 1/4 as large.
	id<MTLRenderPipelineState> _clearPipeline;			// Applies additional inter-frame clearing (cf. the stencil).

	// Buffers.
	id<MTLBuffer> _uniformsBuffer;	// Static; contains a copy of the Uniforms struct.
	id<MTLBuffer> _scansBuffer;		// Dynamic; the CPU writes Scans here for later display.
	id<MTLBuffer> _linesBuffer;		// Dynamic; the CPU writes Lines here for later display.

	// The write area.
	//
	// This receives fragments of output from the emulated machine, so it is written by the CPU
	// and read by the GPU.
	id<MTLTexture> _writeAreaTexture;
	id<MTLBuffer> _writeAreaBuffer;		// The storage underlying the write-area texture.
	size_t _bytesPerInputPixel;			// Determines per-pixel sizing within the write-area texture.
	size_t _totalTextureBytes;			// The total size of the write-area texture.

	// The frame buffer.
	//
	// When inter-frame blending is in use, the frame buffer contains the most recent output.
	// Metal isn't really set up for single-buffered output, so this texture acts as if it were that
	// single buffer. It is complete 2d data, copied directly to the display.
	id<MTLTexture> _frameBuffer;
	MTLRenderPassDescriptor *_frameBufferRenderPass;	// The render pass for drawing _to_ the frame buffer.
	BOOL _dontClearFrameBuffer;

	// The stencil.
	//
	// Scan targets receive scans, not full frames. Those scans may not cover the entire display,
	// either because unlit areas have been omitted or because a sync discrepancy means that the full
	// potential vertical or horizontal extent of the display momentarily isn't used.
	//
	// To manage inter-frame blending correctly in those cases, a stencil is attached to the frame buffer
	// so that a clearing step can darken any pixels that weren't naturally painted during a frame.
	id<MTLTexture> _frameBufferStencil;
	id<MTLDepthStencilState> _drawStencilState;		// Always draws; sets stencil to 1.
	id<MTLDepthStencilState> _clearStencilState;	// Draws only where stencil is 0; clears all to 0.

	// The composition texture.
	//
	// If additional temporal processing is required (i.e. for S-Video and colour composite output),
	// fragments from the write-area texture are assembled into the composition texture, where they
	// properly adjoin their neighbours and everything is converted to a common clock.
	id<MTLTexture> _compositionTexture;
	MTLRenderPassDescriptor *_compositionRenderPass;	// The render pass for drawing _to_ the composition buffer.

	enum class Pipeline {
		/// Scans are painted directly to the frame buffer.
		DirectToDisplay,
		/// Scans are painted to the composition buffer, which is processed to the finalised line buffer,
		/// from which lines are painted to the frame buffer.
		SVideo,
		/// Scans are painted to the composition buffer, which is processed to the separated luma buffer and
		/// then the finalised line buffer, from which lines are painted to the frame buffer.
		CompositeColour

		// TODO: decide what to do for downward-scaled direct-to-display. Obvious options are to include lowpass
		// filtering into the scan outputter and continue hoping that the vertical takes care of itself, or maybe
		// to stick with DirectToDisplay but with a minimum size for the frame buffer and apply filtering from
		// there to the screen.
	};
	Pipeline _pipeline;

	// Additional storage used when processing S-Video and composite colour input.
	id<MTLTexture> _finalisedLineTexture;
	id<MTLComputePipelineState> _finalisedLineState;
	id<MTLTexture> _separatedLumaTexture;
	id<MTLComputePipelineState> _separatedLumaState;
	NSUInteger _lineBufferPixelsPerLine;

	size_t _lineOffsetBuffer;
	id<MTLBuffer> _lineOffsetBuffers[NumBufferedLines];	// Allocating NumBufferedLines buffers ensures these can't
														// possibly be exhausted; for that to happen there'd have to
														// be more draw calls in flight than there are lines for
														// them to operate upon.

	// The scan target in C++-world terms, and the non-GPU storage for it.
	BufferingScanTarget _scanTarget;
	BufferingScanTarget::LineMetadata _lineMetadataBuffer[NumBufferedLines];
	std::atomic_flag _isDrawing;

	// Additional pipeline information.
	size_t _lumaKernelSize;
	size_t _chromaKernelSize;
	std::atomic<bool> _isUsingSupersampling;

	// The output view and its aspect ratio.
	__weak MTKView *_view;
	CGFloat _viewAspectRatio;	// Cached to avoid accessing .bounds away from the main thread.
}
|
|
|
|
/// Creates a scan target that renders into @c view.
///
/// This allocates the command queue; the uniforms, scan, line and write-area buffers (installing the
/// latter three as storage for the buffering scan target); the copy, supersample and clear pipelines;
/// the clearing stencil state; and the pool of line-offset buffers. It then sizes the frame buffer for
/// the view's current drawable size.
///
/// @param view The view to draw to; it is retained only weakly.
- (nonnull instancetype)initWithView:(nonnull MTKView *)view {
	self = [super init];
	if(self) {
		_view = view;

		// Use a single strong reference to the device throughout, rather than alternating between
		// the parameter and the weakly-held ivar.
		id<MTLDevice> device = view.device;
		_commandQueue = [device newCommandQueue];

		// Allocate space for uniforms.
		_uniformsBuffer = [device
			newBufferWithLength:sizeof(Uniforms)
			options:SharedResourceOptionsStandard];

		// Allocate buffers for scans and lines and for the write area texture. The write area is sized for
		// four bytes per pixel, which is the largest input format that -setModals: maps to a pixel format.
		_scansBuffer = [device
			newBufferWithLength:sizeof(Outputs::Display::BufferingScanTarget::Scan)*NumBufferedScans
			options:SharedResourceOptionsStandard];
		_linesBuffer = [device
			newBufferWithLength:sizeof(Outputs::Display::BufferingScanTarget::Line)*NumBufferedLines
			options:SharedResourceOptionsStandard];
		_writeAreaBuffer = [device
			newBufferWithLength:BufferingScanTarget::WriteAreaWidth*BufferingScanTarget::WriteAreaHeight*4
			options:SharedResourceOptionsTexture];

		// Install all that storage in the buffering scan target.
		_scanTarget.set_write_area(reinterpret_cast<uint8_t *>(_writeAreaBuffer.contents));
		_scanTarget.set_line_buffer(reinterpret_cast<BufferingScanTarget::Line *>(_linesBuffer.contents), _lineMetadataBuffer, NumBufferedLines);
		_scanTarget.set_scan_buffer(reinterpret_cast<BufferingScanTarget::Scan *>(_scansBuffer.contents), NumBufferedScans);

		// Generate copy, supersample and clear pipelines; all share a vertex function and differ
		// only in fragment function and, for clearing, in the presence of a stencil attachment.
		id<MTLLibrary> library = [device newDefaultLibrary];
		MTLRenderPipelineDescriptor *const pipelineDescriptor = [[MTLRenderPipelineDescriptor alloc] init];
		pipelineDescriptor.colorAttachments[0].pixelFormat = view.colorPixelFormat;
		pipelineDescriptor.vertexFunction = [library newFunctionWithName:@"copyVertex"];

		// Pipeline creation failures are surfaced in debug builds rather than being silently discarded.
		NSError *error = nil;

		pipelineDescriptor.fragmentFunction = [library newFunctionWithName:@"copyFragment"];
		_copyPipeline = [device newRenderPipelineStateWithDescriptor:pipelineDescriptor error:&error];
		NSAssert(_copyPipeline, @"Couldn't create copy pipeline: %@", error);

		pipelineDescriptor.fragmentFunction = [library newFunctionWithName:@"interpolateFragment"];
		_supersamplePipeline = [device newRenderPipelineStateWithDescriptor:pipelineDescriptor error:&error];
		NSAssert(_supersamplePipeline, @"Couldn't create supersample pipeline: %@", error);

		pipelineDescriptor.fragmentFunction = [library newFunctionWithName:@"clearFragment"];
		pipelineDescriptor.stencilAttachmentPixelFormat = MTLPixelFormatStencil8;
		_clearPipeline = [device newRenderPipelineStateWithDescriptor:pipelineDescriptor error:&error];
		NSAssert(_clearPipeline, @"Couldn't create clear pipeline: %@", error);

		// Clear stencil: always write the reference value (of 0), but draw only where the stencil already
		// had that value.
		MTLDepthStencilDescriptor *const depthStencilDescriptor = [[MTLDepthStencilDescriptor alloc] init];
		depthStencilDescriptor.frontFaceStencil.stencilCompareFunction = MTLCompareFunctionEqual;
		depthStencilDescriptor.frontFaceStencil.depthStencilPassOperation = MTLStencilOperationReplace;
		depthStencilDescriptor.frontFaceStencil.stencilFailureOperation = MTLStencilOperationReplace;
		_clearStencilState = [device newDepthStencilStateWithDescriptor:depthStencilDescriptor];

		// Allocate a large number of single-int buffers, for supplying offsets to the compute shaders.
		// There's a ridiculous amount of overhead in this, but it avoids allocations during drawing,
		// and a single int per instance is all that's needed.
		for(size_t c = 0; c < NumBufferedLines; ++c) {
			_lineOffsetBuffers[c] = [device newBufferWithLength:sizeof(int) options:SharedResourceOptionsStandard];
		}

		// Ensure the is-drawing flag is initially clear.
		_isDrawing.clear();

		// Set initial aspect-ratio multiplier and generate buffers.
		[self mtkView:view drawableSizeWillChange:view.drawableSize];
	}

	return self;
}
|
|
|
|
/*!
	@method mtkView:drawableSizeWillChange:
	@abstract Called whenever the drawableSize of the view will change.
	@discussion Records the new aspect ratio and refreshes the source-to-display transform; then, while
		synchronised on self, announces the resize to the scan target's display metrics, re-enables
		supersampling and regenerates the size-dependent buffers.
	@param view The MTKView which called this method.
	@param size The new drawable size, in pixels.
*/
- (void)mtkView:(nonnull MTKView *)view drawableSizeWillChange:(CGSize)size {
	// Cache the aspect ratio and rebuild the transform that depends upon it.
	_viewAspectRatio = size.width / size.height;
	[self setAspectRatio];

	@synchronized(self) {
		_scanTarget.display_metrics_.announce_did_resize();

		// Always [re]try supersampling upon a resize.
		_isUsingSupersampling.store(true);
		[self updateSizeBuffersToSize:size];
	}
}
|
|
|
|
/// Regenerates all size-dependent buffers for the view's current drawable size, while synchronised on self.
- (void)updateSizeBuffers {
	@synchronized(self) {
		const CGSize currentSize = _view.drawableSize;
		[self updateSizeBuffersToSize:currentSize];
	}
}
|
|
|
|
/// Encodes and commits a render pass that draws @c source across the whole of @c destination using the copy pipeline.
///
/// @param source The texture to read from; it is bound for both the vertex and fragment stages.
/// @param destination The texture to draw to; its existing contents are not loaded.
/// @returns The command buffer, which has already been committed.
- (id<MTLCommandBuffer>)copyTexture:(id<MTLTexture>)source to:(id<MTLTexture>)destination {
	// Target the destination: previous contents are of no interest, but results must be kept.
	MTLRenderPassDescriptor *const renderPass = [[MTLRenderPassDescriptor alloc] init];
	MTLRenderPassColorAttachmentDescriptor *const target = renderPass.colorAttachments[0];
	target.texture = destination;
	target.loadAction = MTLLoadActionDontCare;
	target.storeAction = MTLStoreActionStore;

	id<MTLCommandBuffer> commandBuffer = [_commandQueue commandBuffer];
	id<MTLRenderCommandEncoder> renderEncoder = [commandBuffer renderCommandEncoderWithDescriptor:renderPass];

	// Draw a single four-vertex strip with the copy pipeline, sampling from the source.
	[renderEncoder setRenderPipelineState:_copyPipeline];
	[renderEncoder setVertexTexture:source atIndex:0];
	[renderEncoder setFragmentTexture:source atIndex:0];
	[renderEncoder drawPrimitives:MTLPrimitiveTypeTriangleStrip vertexStart:0 vertexCount:4];
	[renderEncoder endEncoding];

	[commandBuffer commit];
	return commandBuffer;
}
|
|
|
|
/// Replaces the frame buffer, its stencil and the render pass that targets them with versions suited
/// to a drawable of @c size, carrying over the previous frame buffer's contents if there was one.
///
/// @param size The drawable size, in pixels. If supersampling is enabled then the frame buffer is
///		allocated at twice this size in each dimension, subject to the maximum texture size.
- (void)updateSizeBuffersToSize:(CGSize)size {
	// Anecdotally, the size provided here, which ultimately is from _view.drawableSize,
	// already factors in Retina-style scaling.
	//
	// 16384 has been the maximum texture size in all Mac versions of Metal so far, and
	// I haven't yet found a way to query it dynamically. So it's hard-coded.
	const NSUInteger maximumDimension = 16384;

	// Read the atomic supersampling flag exactly once so that width and height always use the same scale.
	const NSUInteger scale = _isUsingSupersampling ? 2 : 1;

	// Metal requires texture dimensions of at least one pixel, so clamp at both ends; this
	// also covers a view that momentarily reports an empty drawable.
	const NSUInteger frameBufferWidth =
		std::max(NSUInteger(1), std::min(NSUInteger(size.width) * scale, maximumDimension));
	const NSUInteger frameBufferHeight =
		std::max(NSUInteger(1), std::min(NSUInteger(size.height) * scale, maximumDimension));

	// Generate a framebuffer and a stencil.
	MTLTextureDescriptor *const textureDescriptor = [MTLTextureDescriptor
		texture2DDescriptorWithPixelFormat:_view.colorPixelFormat
		width:frameBufferWidth
		height:frameBufferHeight
		mipmapped:NO];
	textureDescriptor.usage = MTLTextureUsageRenderTarget | MTLTextureUsageShaderRead | MTLTextureUsageShaderWrite;
	textureDescriptor.resourceOptions = MTLResourceStorageModePrivate;
	id<MTLTexture> previousFrameBuffer = _frameBuffer;
	_frameBuffer = [_view.device newTextureWithDescriptor:textureDescriptor];

	MTLTextureDescriptor *const stencilTextureDescriptor = [MTLTextureDescriptor
		texture2DDescriptorWithPixelFormat:MTLPixelFormatStencil8
		width:frameBufferWidth
		height:frameBufferHeight
		mipmapped:NO];
	stencilTextureDescriptor.usage = MTLTextureUsageRenderTarget;
	stencilTextureDescriptor.resourceOptions = MTLResourceStorageModePrivate;
	_frameBufferStencil = [_view.device newTextureWithDescriptor:stencilTextureDescriptor];

	// Generate a render pass with that framebuffer and stencil.
	_frameBufferRenderPass = [[MTLRenderPassDescriptor alloc] init];
	_frameBufferRenderPass.colorAttachments[0].texture = _frameBuffer;
	_frameBufferRenderPass.colorAttachments[0].loadAction = MTLLoadActionLoad;
	_frameBufferRenderPass.colorAttachments[0].storeAction = MTLStoreActionStore;

	_frameBufferRenderPass.stencilAttachment.clearStencil = 0;
	_frameBufferRenderPass.stencilAttachment.texture = _frameBufferStencil;
	_frameBufferRenderPass.stencilAttachment.loadAction = MTLLoadActionLoad;
	_frameBufferRenderPass.stencilAttachment.storeAction = MTLStoreActionStore;

	// Establish intended stencil usage; it's only to track which pixels haven't been painted
	// at all at the end of every frame. So: always paint, and replace the stored stencil value
	// (which is seeded as 0) with the nominated one (a 1).
	MTLDepthStencilDescriptor *const depthStencilDescriptor = [[MTLDepthStencilDescriptor alloc] init];
	depthStencilDescriptor.frontFaceStencil.stencilCompareFunction = MTLCompareFunctionAlways;
	depthStencilDescriptor.frontFaceStencil.depthStencilPassOperation = MTLStencilOperationReplace;
	_drawStencilState = [_view.device newDepthStencilStateWithDescriptor:depthStencilDescriptor];

	// Draw from the previous frame buffer to the new one if possible; otherwise clear the new one.
	if(previousFrameBuffer) {
		[self copyTexture:previousFrameBuffer to:_frameBuffer];
	} else {
		// TODO: this use of clearTexture is the only reason _frameBuffer has a marked usage of MTLTextureUsageShaderWrite;
		// it'd probably be smarter to blank it with geometry rather than potentially complicating
		// its storage further?
		[self clearTexture:_frameBuffer];
	}

	// Don't clear the framebuffer at the end of this frame.
	_dontClearFrameBuffer = YES;
}
|
|
|
|
/// @returns @c YES if the output gamma currently stored in the uniforms differs from 1.0 by more than 0.01;
/// @c NO otherwise.
- (BOOL)shouldApplyGamma {
	const float gamma = float(uniforms()->outputGamma);
	const float distanceFromUnity = fabsf(gamma - 1.0f);
	return distanceFromUnity > 0.01f;
}
|
|
|
|
/// Encodes and commits a compute pass that runs the library's @c clearKernel over the full extent of @c texture.
///
/// A compute pipeline is built afresh on every call, so this is intended for occasional rather than per-frame use.
///
/// @param texture The texture to clear; it is bound at texture index 0.
- (void)clearTexture:(id<MTLTexture>)texture {
	// Build a pipeline around the clearing kernel.
	id<MTLLibrary> library = [_view.device newDefaultLibrary];
	id<MTLFunction> clearFunction = [library newFunctionWithName:@"clearKernel"];
	id<MTLComputePipelineState> clearPipeline = [_view.device newComputePipelineStateWithFunction:clearFunction error:nil];

	// Dispatch it across the whole texture.
	id<MTLCommandBuffer> commandBuffer = [_commandQueue commandBuffer];
	id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
	[encoder setTexture:texture atIndex:0];
	[self
		dispatchComputeCommandEncoder:encoder
		pipelineState:clearPipeline
		width:texture.width
		height:texture.height
		offsetBuffer:[self bufferForOffset:0]];
	[encoder endEncoding];

	[commandBuffer commit];
}
|
|
|
|
/// Ensures that the intermediate textures and compute pipelines required by the current @c _pipeline exist,
/// and releases those that it doesn't use:
///
///	* direct-to-display output needs none of them;
///	* S-Video needs the composition and finalised line textures;
///	* composite colour additionally needs the separated luma texture.
///
/// Anything that already exists is left as it is.
- (void)updateModalBuffers {
	if(_pipeline == Pipeline::DirectToDisplay) {
		// Buffers are not required when outputting direct to display; so release anything
		// currently being held and return.
		_finalisedLineTexture = nil;
		_finalisedLineState = nil;
		_separatedLumaTexture = nil;
		_separatedLumaState = nil;
		_compositionTexture = nil;
		_compositionRenderPass = nil;
		return;
	}

	// Build a descriptor for any intermediate line texture.
	MTLTextureDescriptor *const lineTextureDescriptor = [MTLTextureDescriptor
		texture2DDescriptorWithPixelFormat:MTLPixelFormatBGRA8Unorm
		width:2048		// This 'should do'.
		height:NumBufferedLines
		mipmapped:NO];
	lineTextureDescriptor.resourceOptions = MTLResourceStorageModePrivate;

	// Create a composition texture if one does not yet exist.
	if(!_compositionTexture) {
		lineTextureDescriptor.usage = MTLTextureUsageRenderTarget | MTLTextureUsageShaderRead;
		_compositionTexture = [_view.device newTextureWithDescriptor:lineTextureDescriptor];
	}

	// Grab the shader library; the remaining textures are written by compute kernels rather than rendered to.
	id<MTLLibrary> library = [_view.device newDefaultLibrary];
	lineTextureDescriptor.usage = MTLTextureUsageShaderWrite | MTLTextureUsageShaderRead;

	// Pipeline creation failures are surfaced in debug builds rather than being silently discarded.
	NSError *error = nil;

	// The finalised texture will definitely exist, and may or may not require a gamma conversion when written to.
	if(!_finalisedLineTexture) {
		_finalisedLineTexture = [_view.device newTextureWithDescriptor:lineTextureDescriptor];
		[self clearTexture:_finalisedLineTexture];

		NSString *const kernelFunction = [self shouldApplyGamma] ? @"filterChromaKernelWithGamma" : @"filterChromaKernelNoGamma";
		_finalisedLineState = [_view.device newComputePipelineStateWithFunction:[library newFunctionWithName:kernelFunction] error:&error];
		NSAssert(_finalisedLineState, @"Couldn't create pipeline for %@: %@", kernelFunction, error);
	}

	// A luma separation texture will exist only for composite colour.
	if(_pipeline == Pipeline::CompositeColour) {
		if(!_separatedLumaTexture) {
			_separatedLumaTexture = [_view.device newTextureWithDescriptor:lineTextureDescriptor];

			// Pick the kernel that matches the luma kernel size; sizes of 5 and below share a
			// kernel, and anything not otherwise listed gets the 15-point version.
			NSString *kernelFunction;
			switch(_lumaKernelSize) {
				default:	kernelFunction = @"separateLumaKernel15";	break;
				case 9:		kernelFunction = @"separateLumaKernel9";	break;
				case 7:		kernelFunction = @"separateLumaKernel7";	break;
				case 1:
				case 3:
				case 5:		kernelFunction = @"separateLumaKernel5";	break;
			}

			_separatedLumaState = [_view.device newComputePipelineStateWithFunction:[library newFunctionWithName:kernelFunction] error:&error];
			NSAssert(_separatedLumaState, @"Couldn't create pipeline for %@: %@", kernelFunction, error);
		}
	} else {
		// Release the pipeline state along with the texture, as the direct-to-display path does; it is
		// always regenerated alongside the texture should composite colour be selected again.
		_separatedLumaTexture = nil;
		_separatedLumaState = nil;
	}
}
|
|
|
|
/// Recomputes the source-to-display transform from the scan target's current modals and the cached
/// view aspect ratio, storing the result in the uniforms.
///
/// The transform is the product of three steps, applied to source coordinates in [0, 1]:
///	(1) recentring, so that the centre of the visible area sits at (0.5, 0.5);
///	(2) conversion to centred [-1, 1] coordinates, with the y axis flipped; and
///	(3) an aspect-ratio stretch combined with the zoom that fits the visible area to the view.
- (void)setAspectRatio {
	const auto modals = _scanTarget.modals();
	const auto &visibleArea = modals.visible_area;

	// Step 1: move the centre of the cropping rectangle to the centre of the display.
	const float recentreX = 0.5f - (visibleArea.origin.x + visibleArea.size.width * 0.5f);
	const float recentreY = 0.5f - (visibleArea.origin.y + visibleArea.size.height * 0.5f);
	const simd::float3x3 recentre(
		simd::float3{1.0f, 0.0f, 0.0f},
		simd::float3{0.0f, 1.0f, 0.0f},
		simd::float3{recentreX, recentreY, 1.0f}
	);

	// Step 2: convert from the internal [0, 1] to centred [-1, 1] (i.e. Metal's eye coordinates, though
	// also appropriate for the zooming step that follows).
	const simd::float3x3 convertToEye(
		simd::float3{2.0f, 0.0f, 0.0f},
		simd::float3{0.0f, -2.0f, 0.0f},
		simd::float3{-1.0f, 1.0f, 1.0f}
	);

	// Step 3: determine the correct zoom level. This is a combination of (i) the necessary horizontal
	// stretch to produce a proper aspect ratio; and (ii) the necessary zoom from there to fit either the
	// visible area's width or its height, whichever is the more restrictive.
	const float stretch = float(modals.aspect_ratio / _viewAspectRatio);
	const float zoomToFitWidth = 1.0f / (float(visibleArea.size.width) * stretch);
	const float zoomToFitHeight = 1.0f / float(visibleArea.size.height);
	const float zoom = std::min(zoomToFitWidth, zoomToFitHeight);

	const float horizontalScale = stretch * zoom;
	const simd::float3x3 applyAspectRatio(
		simd::float3{horizontalScale, 0.0f, 0.0f},
		simd::float3{0.0f, zoom, 0.0f},
		simd::float3{0.0f, 0.0f, 1.0f}
	);

	// Compose, starting from the identity and applying each step in turn on the left.
	simd::float3x3 transform{1.0f};
	transform = recentre * transform;
	transform = convertToEye * transform;
	transform = applyAspectRatio * transform;

	// Store.
	uniforms()->sourcetoDisplay = transform;
}
|
|
|
|
- (void)setModals:(const Outputs::Display::ScanTarget::Modals &)modals {
|
|
//
|
|
// Populate uniforms.
|
|
//
|
|
uniforms()->scale[0] = modals.output_scale.x;
|
|
uniforms()->scale[1] = modals.output_scale.y;
|
|
uniforms()->lineWidth = 1.05f / modals.expected_vertical_lines;
|
|
[self setAspectRatio];
|
|
|
|
const auto toRGB = to_rgb_matrix(modals.composite_colour_space);
|
|
uniforms()->toRGB = simd::float3x3(
|
|
simd::float3{toRGB[0], toRGB[1], toRGB[2]},
|
|
simd::float3{toRGB[3], toRGB[4], toRGB[5]},
|
|
simd::float3{toRGB[6], toRGB[7], toRGB[8]}
|
|
);
|
|
|
|
const auto fromRGB = from_rgb_matrix(modals.composite_colour_space);
|
|
uniforms()->fromRGB = simd::float3x3(
|
|
simd::float3{fromRGB[0], fromRGB[1], fromRGB[2]},
|
|
simd::float3{fromRGB[3], fromRGB[4], fromRGB[5]},
|
|
simd::float3{fromRGB[6], fromRGB[7], fromRGB[8]}
|
|
);
|
|
|
|
// This is fixed for now; consider making it a function of frame rate and/or of whether frame syncing
|
|
// is ongoing (which would require a way to signal that to this scan target).
|
|
uniforms()->outputAlpha = __fp16(0.64f);
|
|
uniforms()->outputMultiplier = __fp16(modals.brightness);
|
|
|
|
const float displayGamma = 2.2f; // This is assumed.
|
|
uniforms()->outputGamma = __fp16(displayGamma / modals.intended_gamma);
|
|
|
|
|
|
|
|
//
|
|
// Generate input texture.
|
|
//
|
|
MTLPixelFormat pixelFormat;
|
|
_bytesPerInputPixel = size_for_data_type(modals.input_data_type);
|
|
if(data_type_is_normalised(modals.input_data_type)) {
|
|
switch(_bytesPerInputPixel) {
|
|
default:
|
|
case 1: pixelFormat = MTLPixelFormatR8Unorm; break;
|
|
case 2: pixelFormat = MTLPixelFormatRG8Unorm; break;
|
|
case 4: pixelFormat = MTLPixelFormatRGBA8Unorm; break;
|
|
}
|
|
} else {
|
|
switch(_bytesPerInputPixel) {
|
|
default:
|
|
case 1: pixelFormat = MTLPixelFormatR8Uint; break;
|
|
case 2: pixelFormat = MTLPixelFormatRG8Uint; break;
|
|
case 4: pixelFormat = MTLPixelFormatRGBA8Uint; break;
|
|
}
|
|
}
|
|
MTLTextureDescriptor *const textureDescriptor = [MTLTextureDescriptor
|
|
texture2DDescriptorWithPixelFormat:pixelFormat
|
|
width:BufferingScanTarget::WriteAreaWidth
|
|
height:BufferingScanTarget::WriteAreaHeight
|
|
mipmapped:NO];
|
|
textureDescriptor.resourceOptions = SharedResourceOptionsTexture;
|
|
if(@available(macOS 10.14, *)) {
|
|
textureDescriptor.allowGPUOptimizedContents = NO;
|
|
}
|
|
|
|
// TODO: the call below is the only reason why this project now requires macOS 10.13; is it all that helpful versus just uploading each frame?
|
|
const NSUInteger bytesPerRow = BufferingScanTarget::WriteAreaWidth * _bytesPerInputPixel;
|
|
_writeAreaTexture = [_writeAreaBuffer
|
|
newTextureWithDescriptor:textureDescriptor
|
|
offset:0
|
|
bytesPerRow:bytesPerRow];
|
|
_totalTextureBytes = bytesPerRow * BufferingScanTarget::WriteAreaHeight;
|
|
|
|
|
|
|
|
//
|
|
// Generate scan pipeline.
|
|
//
|
|
id<MTLLibrary> library = [_view.device newDefaultLibrary];
|
|
MTLRenderPipelineDescriptor *pipelineDescriptor = [[MTLRenderPipelineDescriptor alloc] init];
|
|
|
|
// Occasions when the composition buffer isn't required are slender: the output must be neither RGB nor composite monochrome.
|
|
const bool isComposition =
|
|
modals.display_type != Outputs::Display::DisplayType::RGB &&
|
|
modals.display_type != Outputs::Display::DisplayType::CompositeMonochrome;
|
|
const bool isSVideoOutput = modals.display_type == Outputs::Display::DisplayType::SVideo;
|
|
|
|
if(!isComposition) {
|
|
_pipeline = Pipeline::DirectToDisplay;
|
|
} else {
|
|
_pipeline = isSVideoOutput ? Pipeline::SVideo : Pipeline::CompositeColour;
|
|
}
|
|
|
|
struct FragmentSamplerDictionary {
|
|
/// Fragment shader that outputs to the composition buffer for composite processing.
|
|
NSString *const compositionComposite;
|
|
/// Fragment shader that outputs to the composition buffer for S-Video processing.
|
|
NSString *const compositionSVideo;
|
|
|
|
/// Fragment shader that outputs directly as monochrome composite.
|
|
NSString *const directComposite;
|
|
/// Fragment shader that outputs directly as monochrome composite, with gamma correction.
|
|
NSString *const directCompositeWithGamma;
|
|
/// Fragment shader that outputs directly as RGB.
|
|
NSString *const directRGB;
|
|
/// Fragment shader that outputs directly as RGB, with gamma correction.
|
|
NSString *const directRGBWithGamma;
|
|
};
|
|
const FragmentSamplerDictionary samplerDictionary[8] = {
|
|
// Composite formats.
|
|
{@"compositeSampleLuminance1", nil, @"sampleLuminance1", @"sampleLuminance1", @"sampleLuminance1", @"sampleLuminance1"},
|
|
{@"compositeSampleLuminance8", nil, @"sampleLuminance8", @"sampleLuminance8WithGamma", @"sampleLuminance8", @"sampleLuminance8WithGamma"},
|
|
{@"compositeSamplePhaseLinkedLuminance8", nil, @"samplePhaseLinkedLuminance8", @"samplePhaseLinkedLuminance8WithGamma", @"samplePhaseLinkedLuminance8", @"samplePhaseLinkedLuminance8WithGamma"},
|
|
|
|
// S-Video formats.
|
|
{@"compositeSampleLuminance8Phase8", @"sampleLuminance8Phase8", @"directCompositeSampleLuminance8Phase8", @"directCompositeSampleLuminance8Phase8WithGamma", @"directCompositeSampleLuminance8Phase8", @"directCompositeSampleLuminance8Phase8WithGamma"},
|
|
|
|
// RGB formats.
|
|
{@"compositeSampleRed1Green1Blue1", @"svideoSampleRed1Green1Blue1", @"directCompositeSampleRed1Green1Blue1", @"directCompositeSampleRed1Green1Blue1WithGamma", @"sampleRed1Green1Blue1", @"sampleRed1Green1Blue1"},
|
|
{@"compositeSampleRed2Green2Blue2", @"svideoSampleRed2Green2Blue2", @"directCompositeSampleRed2Green2Blue2", @"directCompositeSampleRed2Green2Blue2WithGamma", @"sampleRed2Green2Blue2", @"sampleRed2Green2Blue2WithGamma"},
|
|
{@"compositeSampleRed4Green4Blue4", @"svideoSampleRed4Green4Blue4", @"directCompositeSampleRed4Green4Blue4", @"directCompositeSampleRed4Green4Blue4WithGamma", @"sampleRed4Green4Blue4", @"sampleRed4Green4Blue4WithGamma"},
|
|
{@"compositeSampleRed8Green8Blue8", @"svideoSampleRed8Green8Blue8", @"directCompositeSampleRed8Green8Blue8", @"directCompositeSampleRed8Green8Blue8WithGamma", @"sampleRed8Green8Blue8", @"sampleRed8Green8Blue8WithGamma"},
|
|
};
|
|
|
|
#ifndef NDEBUG
|
|
// Do a quick check that all the shaders named above are defined in the Metal code. I don't think this is possible at compile time.
|
|
for(int c = 0; c < 8; ++c) {
|
|
#define Test(x) if(samplerDictionary[c].x) assert([library newFunctionWithName:samplerDictionary[c].x]);
|
|
Test(compositionComposite);
|
|
Test(compositionSVideo);
|
|
Test(directComposite);
|
|
Test(directCompositeWithGamma);
|
|
Test(directRGB);
|
|
Test(directRGBWithGamma);
|
|
#undef Test
|
|
}
|
|
#endif
|
|
|
|
uniforms()->cyclesMultiplier = 1.0f;
|
|
if(_pipeline != Pipeline::DirectToDisplay) {
|
|
// Pick a suitable cycle multiplier.
|
|
const float minimumSize = 4.0f * float(modals.colour_cycle_numerator) / float(modals.colour_cycle_denominator);
|
|
while(uniforms()->cyclesMultiplier * modals.cycles_per_line < minimumSize) {
|
|
uniforms()->cyclesMultiplier += 1.0f;
|
|
|
|
if(uniforms()->cyclesMultiplier * modals.cycles_per_line > 2048) {
|
|
uniforms()->cyclesMultiplier -= 1.0f;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Create suitable filters.
|
|
_lineBufferPixelsPerLine = NSUInteger(modals.cycles_per_line) * NSUInteger(uniforms()->cyclesMultiplier);
|
|
const float colourCyclesPerLine = float(modals.colour_cycle_numerator) / float(modals.colour_cycle_denominator);
|
|
|
|
// Compute radians per pixel.
|
|
const float radiansPerPixel = (colourCyclesPerLine * 3.141592654f * 2.0f) / float(_lineBufferPixelsPerLine);
|
|
|
|
// Generate the chrominance filter.
|
|
{
|
|
simd::float3 firCoefficients[8];
|
|
const auto chromaCoefficients = boxCoefficients(radiansPerPixel, 3.141592654f);
|
|
_chromaKernelSize = 15;
|
|
for(size_t c = 0; c < 8; ++c) {
|
|
firCoefficients[c].y = firCoefficients[c].z = (isSVideoOutput ? 2.0f : 1.0f) * chromaCoefficients[c];
|
|
firCoefficients[c].x = 0.0f;
|
|
if(fabsf(chromaCoefficients[c]) < 0.01f) {
|
|
_chromaKernelSize -= 2;
|
|
}
|
|
}
|
|
firCoefficients[7].x = 1.0f;
|
|
|
|
// Luminance will be very soft as a result of the separation phase; apply a sharpen filter to try to undo that.
|
|
// This is applied separately because the first composite processing step is going to select between the nominal
|
|
// chroma and luma parts to take the place of luminance depending on whether a colour burst was found, and high-pass
|
|
// filtering the chrominance channel would be visually detrimental.
|
|
//
|
|
// The low cut off ['Hz' but per line, not per second] is somewhat arbitrary.
|
|
if(!isSVideoOutput) {
|
|
SignalProcessing::FIRFilter sharpenFilter(15, float(_lineBufferPixelsPerLine), 40.0f, colourCyclesPerLine);
|
|
const auto sharpen = sharpenFilter.get_coefficients();
|
|
size_t sharpenFilterSize = 15;
|
|
bool isStart = true;
|
|
for(size_t c = 0; c < 8; ++c) {
|
|
firCoefficients[c].x = sharpen[c];
|
|
if(fabsf(sharpen[c]) > 0.01f) isStart = false;
|
|
if(isStart) sharpenFilterSize -= 2;
|
|
}
|
|
_chromaKernelSize = std::max(_chromaKernelSize, sharpenFilterSize);
|
|
}
|
|
|
|
// Convert to half-size floats.
|
|
for(size_t c = 0; c < 8; ++c) {
|
|
uniforms()->chromaKernel[c] = firCoefficients[c];
|
|
}
|
|
}
|
|
|
|
// Generate the luminance separation filter and determine its required size.
|
|
{
|
|
auto *const filter = uniforms()->lumaKernel;
|
|
const auto coefficients = boxCoefficients(radiansPerPixel, 3.141592654f);
|
|
_lumaKernelSize = 15;
|
|
for(size_t c = 0; c < 8; ++c) {
|
|
filter[c] = __fp16(coefficients[c]);
|
|
if(fabsf(coefficients[c]) < 0.01f) {
|
|
_lumaKernelSize -= 2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update intermediate storage.
|
|
[self updateModalBuffers];
|
|
|
|
if(_pipeline != Pipeline::DirectToDisplay) {
|
|
// Create the composition render pass.
|
|
pipelineDescriptor.colorAttachments[0].pixelFormat = _compositionTexture.pixelFormat;
|
|
pipelineDescriptor.vertexFunction = [library newFunctionWithName:@"scanToComposition"];
|
|
pipelineDescriptor.fragmentFunction =
|
|
[library newFunctionWithName:isSVideoOutput ? samplerDictionary[int(modals.input_data_type)].compositionSVideo : samplerDictionary[int(modals.input_data_type)].compositionComposite];
|
|
|
|
_composePipeline = [_view.device newRenderPipelineStateWithDescriptor:pipelineDescriptor error:nil];
|
|
|
|
_compositionRenderPass = [[MTLRenderPassDescriptor alloc] init];
|
|
_compositionRenderPass.colorAttachments[0].texture = _compositionTexture;
|
|
_compositionRenderPass.colorAttachments[0].loadAction = MTLLoadActionClear;
|
|
_compositionRenderPass.colorAttachments[0].storeAction = MTLStoreActionStore;
|
|
_compositionRenderPass.colorAttachments[0].clearColor = MTLClearColorMake(0.0, 0.5, 0.5, 0.3);
|
|
}
|
|
|
|
// Build the output pipeline.
|
|
pipelineDescriptor.colorAttachments[0].pixelFormat = _view.colorPixelFormat;
|
|
pipelineDescriptor.vertexFunction = [library newFunctionWithName:_pipeline == Pipeline::DirectToDisplay ? @"scanToDisplay" : @"lineToDisplay"];
|
|
|
|
if(_pipeline != Pipeline::DirectToDisplay) {
|
|
pipelineDescriptor.fragmentFunction = [library newFunctionWithName:@"interpolateFragment"];
|
|
} else {
|
|
const bool isRGBOutput = modals.display_type == Outputs::Display::DisplayType::RGB;
|
|
|
|
NSString *shaderName;
|
|
if(isRGBOutput) {
|
|
shaderName = [self shouldApplyGamma] ? samplerDictionary[int(modals.input_data_type)].directRGBWithGamma : samplerDictionary[int(modals.input_data_type)].directRGB;
|
|
} else {
|
|
shaderName = [self shouldApplyGamma] ? samplerDictionary[int(modals.input_data_type)].directCompositeWithGamma : samplerDictionary[int(modals.input_data_type)].directComposite;
|
|
}
|
|
pipelineDescriptor.fragmentFunction = [library newFunctionWithName:shaderName];
|
|
}
|
|
|
|
// Enable blending.
|
|
pipelineDescriptor.colorAttachments[0].blendingEnabled = YES;
|
|
pipelineDescriptor.colorAttachments[0].sourceRGBBlendFactor = MTLBlendFactorSourceAlpha;
|
|
pipelineDescriptor.colorAttachments[0].destinationRGBBlendFactor = MTLBlendFactorOneMinusSourceAlpha;
|
|
|
|
// Set stencil format.
|
|
pipelineDescriptor.stencilAttachmentPixelFormat = MTLPixelFormatStencil8;
|
|
|
|
// Finish.
|
|
_outputPipeline = [_view.device newRenderPipelineStateWithDescriptor:pipelineDescriptor error:nil];
|
|
}
|
|
|
|
/*!
	Encodes a render pass, targeting @c _frameBufferRenderPass, that draws the buffered items in the
	range [start, end) using @c _outputPipeline.

	When the pipeline is direct-to-display the items are scans, sourced from @c _scansBuffer and sampled from
	@c _writeAreaTexture; otherwise they are lines, sourced from @c _linesBuffer and sampled from
	@c _finalisedLineTexture.

	@param start Index of the first item to draw.
	@param end Index one beyond the final item to draw; if equal to @c start then nothing is encoded.
	@param commandBuffer The command buffer that will receive the render pass.
*/
- (void)outputFrom:(size_t)start to:(size_t)end commandBuffer:(id<MTLCommandBuffer>)commandBuffer {
	// An empty range needs no render pass at all.
	if(start == end) return;

	// Everything other than the direct-to-display pipeline outputs lines rather than scans.
	const bool drawsLines = _pipeline != Pipeline::DirectToDisplay;

	id<MTLRenderCommandEncoder> encoder = [commandBuffer renderCommandEncoderWithDescriptor:_frameBufferRenderPass];
	[encoder setRenderPipelineState:_outputPipeline];

	// Select the source texture and the per-instance vertex data appropriate to the pipeline.
	[encoder setFragmentTexture:(drawsLines ? _finalisedLineTexture : _writeAreaTexture) atIndex:0];
	[encoder setVertexBuffer:(drawsLines ? _linesBuffer : _scansBuffer) offset:0 atIndex:0];

	// Uniforms are visible to both shader stages.
	[encoder setVertexBuffer:_uniformsBuffer offset:0 atIndex:1];
	[encoder setFragmentBuffer:_uniformsBuffer offset:0 atIndex:0];

	[encoder setDepthStencilState:_drawStencilState];
	[encoder setStencilReferenceValue:1];
#ifndef NDEBUG
	// Debugging aid: the stencil test is predicated on front-facing pixels, so cull back faces
	// to make it obvious if front-facing geometry isn't being generated.
	[encoder setCullMode:MTLCullModeBack];
#endif

	// Each item is one instance of a four-vertex triangle strip.
#define DrawStrips(first, count)	[encoder drawPrimitives:MTLPrimitiveTypeTriangleStrip vertexStart:0 vertexCount:4 instanceCount:count baseInstance:first]
	RangePerform(start, end, drawsLines ? NumBufferedLines : NumBufferedScans, DrawStrips);
#undef DrawStrips

	[encoder endEncoding];
}
|
|
|
|
/*!
	Encodes a render pass, targeting @c _frameBufferRenderPass, that draws a single four-vertex triangle strip
	with @c _clearPipeline and @c _clearStencilState, using a stencil reference value of 0.

	@param commandBuffer The command buffer that will receive the render pass.
*/
- (void)outputFrameCleanerToCommandBuffer:(id<MTLCommandBuffer>)commandBuffer {
	id<MTLRenderCommandEncoder> cleaner = [commandBuffer renderCommandEncoderWithDescriptor:_frameBufferRenderPass];

	// Shader inputs: the frame buffer is bound to both stages; uniforms go to the fragment stage only.
	[cleaner setFragmentBuffer:_uniformsBuffer offset:0 atIndex:0];
	[cleaner setFragmentTexture:_frameBuffer atIndex:0];
	[cleaner setVertexTexture:_frameBuffer atIndex:0];

	// Fixed-function state for the clearing draw.
	[cleaner setStencilReferenceValue:0];
	[cleaner setDepthStencilState:_clearStencilState];
	[cleaner setRenderPipelineState:_clearPipeline];

	[cleaner drawPrimitives:MTLPrimitiveTypeTriangleStrip vertexStart:0 vertexCount:4];
	[cleaner endEncoding];
}
|
|
|
|
/*!
	Encodes a render pass, targeting @c _compositionRenderPass, that draws every scan within the supplied
	output area as a two-vertex line using @c _composePipeline, sampling from @c _writeAreaTexture.

	@param outputArea The area to compose; only its start and end scan indices are read here.
	@param commandBuffer The command buffer that will receive the render pass.
*/
- (void)composeOutputArea:(const BufferingScanTarget::OutputArea &)outputArea commandBuffer:(id<MTLCommandBuffer>)commandBuffer {
	const id<MTLRenderCommandEncoder> composer = [commandBuffer renderCommandEncoderWithDescriptor:_compositionRenderPass];
	[composer setRenderPipelineState:_composePipeline];

	// Fragment stage: source pixels plus uniforms.
	[composer setFragmentTexture:_writeAreaTexture atIndex:0];
	[composer setFragmentBuffer:_uniformsBuffer offset:0 atIndex:0];

	// Vertex stage: the composition texture, uniforms and the per-instance scans.
	[composer setVertexTexture:_compositionTexture atIndex:0];
	[composer setVertexBuffer:_uniformsBuffer offset:0 atIndex:1];
	[composer setVertexBuffer:_scansBuffer offset:0 atIndex:0];

	// One line primitive per scan.
#define ComposeScans(first, count)	[composer drawPrimitives:MTLPrimitiveTypeLine vertexStart:0 vertexCount:2 instanceCount:count baseInstance:first]
	RangePerform(outputArea.start.scan, outputArea.end.scan, NumBufferedScans, ComposeScans);
#undef ComposeScans

	[composer endEncoding];
}
|
|
|
|
/*!
	Takes the next buffer from @c _lineOffsetBuffers, writes @c offset into the start of its contents
	as an @c int, and advances the next-buffer index, wrapping modulo @c NumBufferedLines.

	@param offset The value to store; it is narrowed to @c int.
	@returns The buffer that now holds @c offset.
*/
- (id<MTLBuffer>)bufferForOffset:(size_t)offset {
	// Claim the current slot and move the index on for the next caller.
	const auto slot = _lineOffsetBuffer;
	_lineOffsetBuffer = (slot + 1) % NumBufferedLines;

	// Populate the claimed buffer with the offset.
	const auto buffer = _lineOffsetBuffers[slot];
	int *const storedOffset = reinterpret_cast<int *>(buffer.contents);
	*storedOffset = int(offset);

	return buffer;
}
|
|
|
|
/*!
	Binds @c offsetBuffer at buffer index 1, installs @c pipelineState and dispatches a grid of
	@c width by @c height threads to the supplied compute encoder.

	@param encoder The compute encoder to dispatch on.
	@param pipelineState The compute pipeline to run; also consulted to size threadgroups.
	@param width Number of threads across the grid.
	@param height Number of threads down the grid.
	@param offsetBuffer The buffer to bind at index 1.
*/
- (void)dispatchComputeCommandEncoder:(id<MTLComputeCommandEncoder>)encoder pipelineState:(id<MTLComputePipelineState>)pipelineState width:(NSUInteger)width height:(NSUInteger)height offsetBuffer:(id<MTLBuffer>)offsetBuffer {
	// Threadgroup sizing follows the recommendations at
	// https://developer.apple.com/documentation/metal/calculating_threadgroup_and_grid_sizes :
	// one execution width across, and as many rows as the pipeline's thread limit then permits.
	const NSUInteger groupWidth = pipelineState.threadExecutionWidth;
	const NSUInteger groupHeight = pipelineState.maxTotalThreadsPerThreadgroup / groupWidth;
	const MTLSize threadsPerThreadgroup = MTLSizeMake(groupWidth, groupHeight, 1);

	// The grid is exactly the requested size, one layer deep.
	const MTLSize threadsPerGrid = MTLSizeMake(width, height, 1);

	// Bind state and dispatch.
	[encoder setBuffer:offsetBuffer offset:0 atIndex:1];
	[encoder setComputePipelineState:pipelineState];
	[encoder dispatchThreads:threadsPerGrid threadsPerThreadgroup:threadsPerThreadgroup];
}
|
|
|
|
/*!
	Applies any pending modals, then encodes and commits whatever drawing is required to bring the
	frame buffer up to date with the scan target's current output area.

	The output area is always reported back to the scan target via @c complete_output_area, from a
	command buffer completion handler, regardless of whether it contained any new lines.
*/
- (void)updateFrameBuffer {
	// TODO: rethink BufferingScanTarget::perform. Is it now really just for guarding the modals?
	_scanTarget.perform([=] {
		const Outputs::Display::ScanTarget::Modals *const newModals = _scanTarget.new_modals();
		if(newModals) {
			[self setModals:*newModals];
		}
	});

	@synchronized(self) {
		// Without a frame buffer render pass there is nowhere to draw to.
		if(!_frameBufferRenderPass) return;

		const auto outputArea = _scanTarget.get_output_area();

		if(outputArea.end.line != outputArea.start.line) {

			// Ensure texture changes are noted. Byte offsets are computed using the write area's declared
			// width, i.e. the same constant from which the texture's row stride is derived.
			const auto writeAreaModificationStart = size_t(outputArea.start.write_area_x + outputArea.start.write_area_y * BufferingScanTarget::WriteAreaWidth) * _bytesPerInputPixel;
			const auto writeAreaModificationEnd = size_t(outputArea.end.write_area_x + outputArea.end.write_area_y * BufferingScanTarget::WriteAreaWidth) * _bytesPerInputPixel;
#define FlushRegion(start, size)	[_writeAreaBuffer didModifyRange:NSMakeRange(start, size)]
			RangePerform(writeAreaModificationStart, writeAreaModificationEnd, _totalTextureBytes, FlushRegion);
#undef FlushRegion

			// Obtain a source for render command encoders.
			id<MTLCommandBuffer> commandBuffer = [_commandQueue commandBuffer];

			//
			// Drawing algorithm used below, in broad terms:
			//
			// Maintain a persistent buffer of current CRT state.
			//
			// During each frame, paint to the persistent buffer anything new. Update a stencil buffer to track
			// every pixel so-far touched.
			//
			// At the end of the frame, draw a 'frame cleaner', which is a whole-screen rect that paints over
			// only those areas that the stencil buffer indicates weren't painted this frame.
			//
			// Hence every pixel is touched every frame, regardless of the machine's output.
			//

			switch(_pipeline) {
				case Pipeline::DirectToDisplay: {
					// Output scans directly, broken up by frame.
					size_t line = outputArea.start.line;
					size_t scan = outputArea.start.scan;
					while(line != outputArea.end.line) {
						if(_lineMetadataBuffer[line].is_first_in_frame) {
							// Flush everything that belongs to the frame now ending.
							[self outputFrom:scan to:_lineMetadataBuffer[line].first_scan commandBuffer:commandBuffer];
							scan = _lineMetadataBuffer[line].first_scan;

							if(_lineMetadataBuffer[line].previous_frame_was_complete && !_dontClearFrameBuffer) {
								[self outputFrameCleanerToCommandBuffer:commandBuffer];
							}
							_dontClearFrameBuffer = NO;
						}
						line = (line + 1) % NumBufferedLines;
					}
					[self outputFrom:scan to:outputArea.end.scan commandBuffer:commandBuffer];
				} break;

				case Pipeline::CompositeColour:
				case Pipeline::SVideo: {
					// Build the composition buffer.
					[self composeOutputArea:outputArea commandBuffer:commandBuffer];

					if(_pipeline == Pipeline::SVideo) {
						// Filter from composition to the finalised line texture.
						id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
						[computeEncoder setTexture:_compositionTexture atIndex:0];
						[computeEncoder setTexture:_finalisedLineTexture atIndex:1];
						[computeEncoder setBuffer:_uniformsBuffer offset:0 atIndex:0];

						if(outputArea.end.line > outputArea.start.line) {
							[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:outputArea.end.line - outputArea.start.line offsetBuffer:[self bufferForOffset:outputArea.start.line]];
						} else {
							// The range wraps around the end of the line buffer: process the tail, then the head if any.
							[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:NumBufferedLines - outputArea.start.line offsetBuffer:[self bufferForOffset:outputArea.start.line]];
							if(outputArea.end.line) {
								[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:outputArea.end.line offsetBuffer:[self bufferForOffset:0]];
							}
						}

						[computeEncoder endEncoding];
					} else {
						// Separate luminance.
						id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
						[computeEncoder setTexture:_compositionTexture atIndex:0];
						[computeEncoder setTexture:_separatedLumaTexture atIndex:1];
						[computeEncoder setBuffer:_uniformsBuffer offset:0 atIndex:0];

						// The same offset buffers are used by both compute stages below; the second
						// is populated only if the range wraps and has a non-empty head.
						__unsafe_unretained id<MTLBuffer> offsetBuffers[2] = {nil, nil};
						offsetBuffers[0] = [self bufferForOffset:outputArea.start.line];

						if(outputArea.end.line > outputArea.start.line) {
							[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_separatedLumaState width:_lineBufferPixelsPerLine height:outputArea.end.line - outputArea.start.line offsetBuffer:offsetBuffers[0]];
						} else {
							[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_separatedLumaState width:_lineBufferPixelsPerLine height:NumBufferedLines - outputArea.start.line offsetBuffer:offsetBuffers[0]];
							if(outputArea.end.line) {
								offsetBuffers[1] = [self bufferForOffset:0];
								[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_separatedLumaState width:_lineBufferPixelsPerLine height:outputArea.end.line offsetBuffer:offsetBuffers[1]];
							}
						}

						// Filter resulting chrominance.
						[computeEncoder setTexture:_separatedLumaTexture atIndex:0];
						[computeEncoder setTexture:_finalisedLineTexture atIndex:1];
						[computeEncoder setBuffer:_uniformsBuffer offset:0 atIndex:0];

						if(outputArea.end.line > outputArea.start.line) {
							[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:outputArea.end.line - outputArea.start.line offsetBuffer:offsetBuffers[0]];
						} else {
							[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:NumBufferedLines - outputArea.start.line offsetBuffer:offsetBuffers[0]];
							if(outputArea.end.line) {
								[self dispatchComputeCommandEncoder:computeEncoder pipelineState:_finalisedLineState width:_lineBufferPixelsPerLine height:outputArea.end.line offsetBuffer:offsetBuffers[1]];
							}
						}

						[computeEncoder endEncoding];
					}

					// Output lines, broken up by frame.
					size_t startLine = outputArea.start.line;
					size_t line = outputArea.start.line;
					while(line != outputArea.end.line) {
						if(_lineMetadataBuffer[line].is_first_in_frame) {
							// Flush everything that belongs to the frame now ending.
							[self outputFrom:startLine to:line commandBuffer:commandBuffer];
							startLine = line;

							if(_lineMetadataBuffer[line].previous_frame_was_complete && !_dontClearFrameBuffer) {
								[self outputFrameCleanerToCommandBuffer:commandBuffer];
							}
							_dontClearFrameBuffer = NO;
						}
						line = (line + 1) % NumBufferedLines;
					}
					[self outputFrom:startLine to:outputArea.end.line commandBuffer:commandBuffer];
				} break;
			}

			// Add a callback to update the scan target buffer and commit the drawing.
			[commandBuffer addCompletedHandler:^(id<MTLCommandBuffer> _Nonnull) {
				self->_scanTarget.complete_output_area(outputArea);
			}];
			[commandBuffer commit];
		} else {
			// There was no work, but to be contractually correct, remember to announce completion,
			// and do it after finishing an empty command queue, as a cheap way to ensure this doesn't
			// front run any actual processing. TODO: can I do a better job of that?
			id<MTLCommandBuffer> commandBuffer = [_commandQueue commandBuffer];
			[commandBuffer addCompletedHandler:^(id<MTLCommandBuffer> _Nonnull) {
				self->_scanTarget.complete_output_area(outputArea);
			}];
			[commandBuffer commit];

			// TODO: reenable these and work out how on earth the Master System + Alex Kidd (US) is managing
			// to provide write_area_y = 0, start_x = 0, end_x = 1.
//			assert(outputArea.end.line == outputArea.start.line);
//			assert(outputArea.end.scan == outputArea.start.scan);
//			assert(outputArea.end.write_area_y == outputArea.start.write_area_y);
//			assert(outputArea.end.write_area_x == outputArea.start.write_area_x);
		}
	}
}
|
|
|
|
/*!
	@method drawInMTKView:
	@abstract Called on the delegate when it is asked to render into the view.
	@discussion Copies the current contents of @c _frameBuffer to the view, using either the supersampling
		or the plain copy pipeline. If a previous draw is still in flight, or the view cannot currently
		supply a render pass descriptor and drawable, nothing is drawn and the draw is announced as
		not having happened.
	@param view The view to render into.
*/
- (void)drawInMTKView:(nonnull MTKView *)view {
	// Permit only one draw to be in flight at a time.
	if(_isDrawing.test_and_set()) {
		_scanTarget.display_metrics_.announce_draw_status(false);
		return;
	}

	// Disable supersampling if performance requires it.
	if(_isUsingSupersampling && _scanTarget.display_metrics_.should_lower_resolution()) {
		_isUsingSupersampling = false;
		[self updateSizeBuffers];
	}

	// Fetch the render pass descriptor and drawable exactly once: either may be nil if the view has no
	// drawable to offer, and taking a single reference to the descriptor guarantees that the load action
	// set below applies to the same object that is used to create the encoder.
	MTLRenderPassDescriptor *const renderPassDescriptor = view.currentRenderPassDescriptor;
	const id<MTLDrawable> drawable = view.currentDrawable;
	if(!renderPassDescriptor || !drawable) {
		// Nothing can be drawn; release the in-flight flag so that future draws aren't blocked, and
		// report the frame as not drawn, mirroring the skipped-frame path above.
		_isDrawing.clear();
		_scanTarget.display_metrics_.announce_draw_status(false);
		return;
	}

	// Schedule a copy from the current framebuffer to the view; blitting is unavailable as the target is a framebuffer texture.
	id<MTLCommandBuffer> commandBuffer = [_commandQueue commandBuffer];

	// Every pixel will be drawn, so don't clear or reload.
	renderPassDescriptor.colorAttachments[0].loadAction = MTLLoadActionDontCare;
	id<MTLRenderCommandEncoder> encoder = [commandBuffer renderCommandEncoderWithDescriptor:renderPassDescriptor];

	[encoder setRenderPipelineState:_isUsingSupersampling ? _supersamplePipeline : _copyPipeline];
	[encoder setVertexTexture:_frameBuffer atIndex:0];
	[encoder setFragmentTexture:_frameBuffer atIndex:0];

	[encoder drawPrimitives:MTLPrimitiveTypeTriangleStrip vertexStart:0 vertexCount:4];
	[encoder endEncoding];

	// Present, and upon completion release the in-flight flag and announce a successful draw.
	[commandBuffer presentDrawable:drawable];
	[commandBuffer addCompletedHandler:^(id<MTLCommandBuffer> _Nonnull) {
		self->_isDrawing.clear();
		self->_scanTarget.display_metrics_.announce_draw_status(true);
	}];
	[commandBuffer commit];
}
|
|
|
|
/*!
	@returns A pointer to the scan target owned by this object.
*/
- (Outputs::Display::ScanTarget *)scanTarget {
	Outputs::Display::ScanTarget *const target = &_scanTarget;
	return target;
}
|
|
|
|
/*!
	Forwards notice of an impending change of owner to the scan target.
*/
- (void)willChangeOwner {
	[self scanTarget]->will_change_owner();
}
|
|
|
|
/*!
	Captures the current contents of @c _frameBuffer as a bitmap.

	The frame buffer is copied to a temporary texture, its bytes are read back into a newly-created
	32bpp bitmap of the same dimensions, and every pixel is then made fully opaque. If the view's pixel
	format stores channels in BGRA order, red and blue are swapped so that the result is RGBA.

	@returns A new @c NSBitmapImageRep holding the captured image.
*/
- (NSBitmapImageRep *)imageRepresentation {
	const NSUInteger width = _frameBuffer.width;
	const NSUInteger height = _frameBuffer.height;

	// Create an NSBitmapRep as somewhere to copy pixel data to.
	NSBitmapImageRep *const result =
		[[NSBitmapImageRep alloc]
			initWithBitmapDataPlanes:NULL
			pixelsWide:(NSInteger)width
			pixelsHigh:(NSInteger)height
			bitsPerSample:8
			samplesPerPixel:4
			hasAlpha:YES
			isPlanar:NO
			colorSpaceName:NSDeviceRGBColorSpace
			bytesPerRow:4 * (NSInteger)width
			bitsPerPixel:0];

	// Create a CPU-accessible texture and copy the current contents of the _frameBuffer to it.
	// TODO: supersample rather than directly copy if appropriate?
	// NOTE(review): the texture uses managed storage; confirm that copyTexture:to: leaves it
	// synchronised for CPU reads by the time its command buffer completes.
	id<MTLTexture> cpuTexture;
	MTLTextureDescriptor *const textureDescriptor = [MTLTextureDescriptor
		texture2DDescriptorWithPixelFormat:_view.colorPixelFormat
		width:width
		height:height
		mipmapped:NO];
	textureDescriptor.usage = MTLTextureUsageRenderTarget | MTLTextureUsageShaderRead;
	textureDescriptor.resourceOptions = MTLResourceStorageModeManaged;
	cpuTexture = [_view.device newTextureWithDescriptor:textureDescriptor];
	[[self copyTexture:_frameBuffer to:cpuTexture] waitUntilCompleted];

	// Copy from the CPU-visible texture to the bitmap image representation.
	uint8_t *const bitmapData = result.bitmapData;
	[cpuTexture
		getBytes:bitmapData
		bytesPerRow:width*4
		fromRegion:MTLRegionMake2D(0, 0, width, height)
		mipmapLevel:0];

	// Set alpha to fully opaque and do some byte shuffling if necessary;
	// Apple likes BGR for output but RGB is the best I can specify to NSBitmapImageRep.
	//
	// The conversion is done here on the CPU rather than on the GPU because that permits reuse of
	// _copyPipeline and thereby cuts down on boilerplate, especially given that screenshots are
	// not a bottleneck.
	//
	// Both the linear and the sRGB BGRA formats store bytes in blue, green, red, alpha order,
	// so both require the swap.
	const NSUInteger totalBytes = width * height * 4;
	const MTLPixelFormat pixelFormat = _view.colorPixelFormat;
	const bool flipRedBlue =
		pixelFormat == MTLPixelFormatBGRA8Unorm ||
		pixelFormat == MTLPixelFormatBGRA8Unorm_sRGB;
	for(NSUInteger offset = 0; offset < totalBytes; offset += 4) {
		if(flipRedBlue) {
			const uint8_t red = bitmapData[offset];
			bitmapData[offset] = bitmapData[offset+2];
			bitmapData[offset+2] = red;
		}
		bitmapData[offset+3] = 0xff;
	}

	return result;
}
|
|
|
|
@end
|