mirror of
https://github.com/depp/syncfiles.git
synced 2025-01-20 07:31:38 +00:00
363 lines
8.2 KiB
C
363 lines
8.2 KiB
C
// Copyright 2022 Dietrich Epp.
|
|
// This file is part of SyncFiles. SyncFiles is licensed under the terms of the
|
|
// Mozilla Public License, version 2.0. See LICENSE.txt for details.
|
|
|
|
// convert_1r.c - Reverse conversion from UTF-8 to extended ASCII.
|
|
#include "convert/convert.h"
|
|
#include "lib/defs.h"
|
|
|
|
enum {
|
|
// Maximum length of encoded character.
|
|
kMaxEncodedLength = 8,
|
|
|
|
// Initial number of nodes to allocate when building the tree.
|
|
kInitialTableAlloc = 8
|
|
};
|
|
|
|
struct TEntry {
|
|
// The output character, or zero if no output.
|
|
UInt8 output;
|
|
// The next node, or zero if no next node.
|
|
UInt8 next;
|
|
};
|
|
|
|
// A node for building the converter.
|
|
struct TNode {
|
|
struct TEntry entries[256];
|
|
};
|
|
|
|
struct TTree {
|
|
struct TNode **nodes;
|
|
int count;
|
|
};
|
|
|
|
static ErrorCode CreateTree(struct TTree *tree, Handle data, Size datasz)
|
|
{
|
|
struct TNode **nodes, *node;
|
|
int i, j, dpos, enclen, encend, state, cur, nodecount, nodealloc;
|
|
unsigned ch;
|
|
|
|
// Create a tree with a root node mapping all the ASCII characters except
|
|
// NUL, CR, and LF. NUL won't map because an output of 0 is interpreted as
|
|
// no output. CR and LF are removed so they can be handled specially be the
|
|
// decoder.
|
|
nodes =
|
|
(struct TNode **)NewHandle(kInitialTableAlloc * sizeof(struct TNode));
|
|
if (nodes == NULL) {
|
|
return kErrorNoMemory;
|
|
}
|
|
nodecount = 1;
|
|
nodealloc = kInitialTableAlloc;
|
|
node = *nodes;
|
|
MemClear(node, sizeof(struct TNode));
|
|
for (i = 0; i < 128; i++) {
|
|
node->entries[i].output = i;
|
|
}
|
|
node->entries[kCharLF].output = 0;
|
|
node->entries[kCharCR].output = 0;
|
|
|
|
// Parse the table data and build up a tree of TNode.
|
|
dpos = 1;
|
|
// For each high character (128..255).
|
|
for (i = 0; i < 128; i++) {
|
|
// For each encoding of that character.
|
|
for (j = 0; j < 2; j++) {
|
|
if (dpos >= datasz) {
|
|
goto bad_table;
|
|
}
|
|
enclen = (UInt8)(*data)[dpos++];
|
|
if (enclen != 0) {
|
|
if (enclen < 2 || enclen > datasz - dpos ||
|
|
enclen > kMaxEncodedLength) {
|
|
goto bad_table;
|
|
}
|
|
// Iterate over all but last byte in encoding, to find the node
|
|
// which will produce the decoded byte as output.
|
|
state = 0;
|
|
node = *nodes;
|
|
for (encend = dpos + enclen - 1; dpos < encend; dpos++) {
|
|
ch = (UInt8)(*data)[dpos];
|
|
cur = state;
|
|
state = node->entries[ch].next;
|
|
if (state == 0) {
|
|
if (nodecount >= nodealloc) {
|
|
nodealloc *= 2;
|
|
if (!ResizeHandle(
|
|
(Handle)nodes,
|
|
nodealloc * sizeof(struct TNode))) {
|
|
DisposeHandle((Handle)nodes);
|
|
return kErrorNoMemory;
|
|
}
|
|
node = *nodes + cur;
|
|
}
|
|
state = nodecount++;
|
|
node->entries[ch].next = state;
|
|
node = (*nodes) + state;
|
|
MemClear(node, sizeof(*node));
|
|
} else {
|
|
node = *nodes + state;
|
|
}
|
|
}
|
|
ch = (UInt8)(*data)[dpos++];
|
|
if (node->entries[ch].output != 0) {
|
|
goto bad_table;
|
|
}
|
|
node->entries[ch].output = i | 0x80;
|
|
}
|
|
}
|
|
}
|
|
if (!ResizeHandle((Handle)nodes, nodecount * sizeof(struct TNode))) {
|
|
DisposeHandle((Handle)nodes);
|
|
return kErrorNoMemory;
|
|
}
|
|
tree->nodes = nodes;
|
|
tree->count = nodecount;
|
|
return 0;
|
|
|
|
bad_table:
|
|
DisposeHandle((Handle)nodes);
|
|
return kErrorBadData;
|
|
}
|
|
|
|
struct NodeInfo {
|
|
UInt8 min;
|
|
UInt8 max;
|
|
UInt16 offset;
|
|
};
|
|
|
|
struct CEntry {
|
|
UInt16 output;
|
|
UInt16 next;
|
|
};
|
|
|
|
// A compressed table node. Followed by an array of centry.
|
|
struct CNode {
|
|
// First byte in table.
|
|
UInt8 base;
|
|
// Number of entries in table, minus one.
|
|
UInt8 span;
|
|
};
|
|
|
|
static ErrorCode CompactTree(Handle *out, struct TNode **nodes, int nodecount)
|
|
{
|
|
Handle ctree;
|
|
struct TNode *node;
|
|
struct NodeInfo **infos, *info;
|
|
struct CNode *cnode;
|
|
struct CEntry *centry;
|
|
int i, j, min, max, count, next;
|
|
unsigned offset;
|
|
|
|
// Figure out where each compacted node will go.
|
|
infos = (struct NodeInfo **)NewHandle(sizeof(struct NodeInfo) * nodecount);
|
|
if (infos == NULL) {
|
|
return kErrorNoMemory;
|
|
}
|
|
offset = 0;
|
|
for (i = 0; i < nodecount; i++) {
|
|
node = *nodes + i;
|
|
min = 0;
|
|
while (node->entries[min].output == 0 && node->entries[min].next == 0) {
|
|
min++;
|
|
}
|
|
max = 255;
|
|
while (node->entries[max].output == 0 && node->entries[max].next == 0) {
|
|
max--;
|
|
}
|
|
info = *infos + i;
|
|
info->min = min;
|
|
info->max = max;
|
|
info->offset = offset;
|
|
count = max - min + 1;
|
|
offset += sizeof(struct CNode) + count * sizeof(struct CEntry);
|
|
}
|
|
|
|
// Create the compacted tree.
|
|
ctree = NewHandle(offset);
|
|
if (ctree == NULL) {
|
|
DisposeHandle((Handle)infos);
|
|
return kErrorNoMemory;
|
|
}
|
|
for (i = 0; i < nodecount; i++) {
|
|
node = *nodes + i;
|
|
info = *infos + i;
|
|
min = info->min;
|
|
max = info->max;
|
|
offset = info->offset;
|
|
cnode = (void *)(*ctree + offset);
|
|
cnode->base = min;
|
|
cnode->span = max - min;
|
|
centry = (void *)(*ctree + offset + sizeof(struct CNode));
|
|
for (j = min; j <= max; j++) {
|
|
centry->output = node->entries[j].output;
|
|
next = node->entries[j].next;
|
|
if (next != 0) {
|
|
next = (*infos)[next].offset;
|
|
}
|
|
centry->next = next;
|
|
centry++;
|
|
}
|
|
}
|
|
|
|
DisposeHandle((Handle)infos);
|
|
*out = ctree;
|
|
return 0;
|
|
}
|
|
|
|
ErrorCode Convert1rBuild(Handle *out, Handle data, Size datasz)
|
|
{
|
|
struct TTree table;
|
|
ErrorCode err;
|
|
|
|
err = CreateTree(&table, data, datasz);
|
|
if (err != 0) {
|
|
return err;
|
|
}
|
|
err = CompactTree(out, table.nodes, table.count);
|
|
DisposeHandle((Handle)table.nodes);
|
|
return err;
|
|
}
|
|
|
|
struct Convert1rState {
|
|
UInt8 lastch;
|
|
UInt8 output;
|
|
UInt16 tableoffset;
|
|
};
|
|
|
|
void Convert1rRun(const void *cvtptr, LineBreakConversion lc,
|
|
struct ConverterState *stateptr, UInt8 **optr, UInt8 *oend,
|
|
const UInt8 **iptr, const UInt8 *iend)
|
|
{
|
|
struct Convert1rState *state = (struct Convert1rState *)stateptr;
|
|
const struct CNode *node;
|
|
const struct CEntry *entry;
|
|
UInt8 *opos = *optr;
|
|
const UInt8 *ipos = *iptr, *savein;
|
|
unsigned ch, lastch, chlen, output, saveout, toffset, savetoffset;
|
|
|
|
ch = state->lastch;
|
|
savein = ipos;
|
|
saveout = state->output;
|
|
toffset = state->tableoffset;
|
|
savetoffset = toffset;
|
|
if (oend - opos < 2) {
|
|
goto done;
|
|
}
|
|
goto resume;
|
|
|
|
next_out:
|
|
if (oend - opos < 2) {
|
|
goto done;
|
|
}
|
|
|
|
// Follow state machine to the end.
|
|
savein = ipos;
|
|
saveout = 0;
|
|
toffset = 0;
|
|
savetoffset = 0;
|
|
resume:
|
|
for (;;) {
|
|
if (ipos >= iend) {
|
|
goto done;
|
|
}
|
|
lastch = ch;
|
|
ch = *ipos++;
|
|
|
|
node = (const void *)((const UInt8 *)cvtptr + toffset);
|
|
ch -= node->base;
|
|
if (ch > node->span) {
|
|
toffset = 0;
|
|
goto bad_char;
|
|
}
|
|
entry =
|
|
(const void *)((const UInt8 *)cvtptr + toffset +
|
|
sizeof(struct CNode) + ch * sizeof(struct CEntry));
|
|
output = entry->output;
|
|
toffset = entry->next;
|
|
if (toffset == 0) {
|
|
// Reached end of tree.
|
|
if (output == 0) {
|
|
goto bad_char;
|
|
}
|
|
*opos++ = output;
|
|
goto next_out;
|
|
}
|
|
if (output != 0) {
|
|
// Can produce output here, or can consume more input. We try
|
|
// consuming more input, but save the state to rewind if that fails.
|
|
savein = ipos;
|
|
saveout = output;
|
|
savetoffset = toffset;
|
|
}
|
|
}
|
|
|
|
bad_char:
|
|
// Bad character. Back up and try again.
|
|
ipos = savein;
|
|
if (saveout != 0) {
|
|
// Produce saved output.
|
|
*opos++ = saveout;
|
|
ch = 0;
|
|
} else {
|
|
// No saved output, this really is a bad character. Consume one UTF-8
|
|
// character, emit it as a fallback, and continue.
|
|
ch = *ipos++;
|
|
if ((ch & 0x80) == 0) {
|
|
// ASCII character: either NUL, CR, or LF, because only these
|
|
// characters will result in a transition to state 0.
|
|
if (ch == 0) {
|
|
*opos++ = ch;
|
|
} else if (ch == kCharLF && lastch == kCharCR) {
|
|
if (lc == kLineBreakKeep) {
|
|
*opos++ = ch;
|
|
}
|
|
} else {
|
|
switch (lc) {
|
|
case kLineBreakKeep:
|
|
*opos++ = ch;
|
|
break;
|
|
case kLineBreakLF:
|
|
*opos++ = kCharLF;
|
|
break;
|
|
case kLineBreakCR:
|
|
*opos++ = kCharCR;
|
|
break;
|
|
case kLineBreakCRLF:
|
|
*opos++ = kCharCR;
|
|
*opos++ = kCharLF;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
if ((ch & 0xe0) == 0xc0) {
|
|
chlen = 1;
|
|
} else if ((ch & 0xf0) == 0xe0) {
|
|
chlen = 2;
|
|
} else if ((ch & 0xf8) == 0xf0) {
|
|
chlen = 3;
|
|
} else {
|
|
chlen = 0;
|
|
}
|
|
for (; chlen > 0; chlen--) {
|
|
if (ipos == iend) {
|
|
goto done;
|
|
}
|
|
ch = *ipos;
|
|
if ((ch & 0xc0) != 0x80) {
|
|
break;
|
|
}
|
|
ipos++;
|
|
}
|
|
*opos++ = kCharSubstitute;
|
|
}
|
|
}
|
|
goto next_out;
|
|
|
|
done:
|
|
state->lastch = ch;
|
|
state->output = saveout;
|
|
state->tableoffset = savetoffset;
|
|
*optr = opos;
|
|
*iptr = savein;
|
|
}
|