syncfiles/convert/convert_1r.c

363 lines
8.2 KiB
C
Raw Normal View History

// Copyright 2022 Dietrich Epp.
// This file is part of SyncFiles. SyncFiles is licensed under the terms of the
// Mozilla Public License, version 2.0. See LICENSE.txt for details.
2022-04-22 15:44:07 +00:00
// convert_1r.c - Reverse conversion from UTF-8 to extended ASCII.
2022-03-23 21:51:55 +00:00
#include "convert/convert.h"
2022-03-25 05:45:55 +00:00
#include "lib/defs.h"
enum {
2022-04-22 15:44:07 +00:00
// Maximum length of encoded character.
kMaxEncodedLength = 8,
2022-04-22 15:44:07 +00:00
// Initial number of nodes to allocate when building the tree.
kInitialTableAlloc = 8
};
struct TEntry {
2022-04-22 15:44:07 +00:00
// The output character, or zero if no output.
UInt8 output;
2022-04-22 15:44:07 +00:00
// The next node, or zero if no next node.
UInt8 next;
};
2022-04-22 15:44:07 +00:00
// A node for building the converter.
struct TNode {
struct TEntry entries[256];
};
struct TTree {
struct TNode **nodes;
int count;
};
static ErrorCode CreateTree(struct TTree *tree, Handle data, Size datasz)
{
struct TNode **nodes, *node;
int i, j, dpos, enclen, encend, state, cur, nodecount, nodealloc;
unsigned ch;
2022-04-22 15:44:07 +00:00
// Create a tree with a root node mapping all the ASCII characters except
// NUL, CR, and LF. NUL won't map because an output of 0 is interpreted as
// no output. CR and LF are removed so they can be handled specially be the
// decoder.
nodes =
(struct TNode **)NewHandle(kInitialTableAlloc * sizeof(struct TNode));
if (nodes == NULL) {
return kErrorNoMemory;
}
nodecount = 1;
nodealloc = kInitialTableAlloc;
node = *nodes;
MemClear(node, sizeof(struct TNode));
for (i = 0; i < 128; i++) {
node->entries[i].output = i;
}
node->entries[kCharLF].output = 0;
node->entries[kCharCR].output = 0;
2022-04-22 15:44:07 +00:00
// Parse the table data and build up a tree of TNode.
dpos = 1;
2022-04-22 15:44:07 +00:00
// For each high character (128..255).
for (i = 0; i < 128; i++) {
2022-04-22 15:44:07 +00:00
// For each encoding of that character.
for (j = 0; j < 2; j++) {
if (dpos >= datasz) {
goto bad_table;
}
enclen = (UInt8)(*data)[dpos++];
if (enclen != 0) {
if (enclen < 2 || enclen > datasz - dpos ||
enclen > kMaxEncodedLength) {
goto bad_table;
}
2022-04-22 15:44:07 +00:00
// Iterate over all but last byte in encoding, to find the node
// which will produce the decoded byte as output.
state = 0;
node = *nodes;
for (encend = dpos + enclen - 1; dpos < encend; dpos++) {
ch = (UInt8)(*data)[dpos];
cur = state;
state = node->entries[ch].next;
if (state == 0) {
if (nodecount >= nodealloc) {
nodealloc *= 2;
if (!ResizeHandle(
(Handle)nodes,
nodealloc * sizeof(struct TNode))) {
DisposeHandle((Handle)nodes);
return kErrorNoMemory;
}
node = *nodes + cur;
}
state = nodecount++;
node->entries[ch].next = state;
node = (*nodes) + state;
MemClear(node, sizeof(*node));
} else {
node = *nodes + state;
}
}
ch = (UInt8)(*data)[dpos++];
if (node->entries[ch].output != 0) {
goto bad_table;
}
node->entries[ch].output = i | 0x80;
}
}
}
if (!ResizeHandle((Handle)nodes, nodecount * sizeof(struct TNode))) {
DisposeHandle((Handle)nodes);
return kErrorNoMemory;
}
tree->nodes = nodes;
tree->count = nodecount;
return 0;
bad_table:
DisposeHandle((Handle)nodes);
return kErrorBadData;
}
struct NodeInfo {
UInt8 min;
UInt8 max;
UInt16 offset;
};
struct CEntry {
UInt16 output;
UInt16 next;
};
2022-04-22 15:44:07 +00:00
// A compressed table node. Followed by an array of centry.
struct CNode {
2022-04-22 15:44:07 +00:00
// First byte in table.
UInt8 base;
2022-04-22 15:44:07 +00:00
// Number of entries in table, minus one.
UInt8 span;
};
static ErrorCode CompactTree(Handle *out, struct TNode **nodes, int nodecount)
{
Handle ctree;
struct TNode *node;
struct NodeInfo **infos, *info;
struct CNode *cnode;
struct CEntry *centry;
int i, j, min, max, count, next;
unsigned offset;
2022-04-22 15:44:07 +00:00
// Figure out where each compacted node will go.
infos = (struct NodeInfo **)NewHandle(sizeof(struct NodeInfo) * nodecount);
if (infos == NULL) {
return kErrorNoMemory;
}
offset = 0;
for (i = 0; i < nodecount; i++) {
node = *nodes + i;
min = 0;
while (node->entries[min].output == 0 && node->entries[min].next == 0) {
min++;
}
max = 255;
while (node->entries[max].output == 0 && node->entries[max].next == 0) {
max--;
}
info = *infos + i;
info->min = min;
info->max = max;
info->offset = offset;
count = max - min + 1;
offset += sizeof(struct CNode) + count * sizeof(struct CEntry);
}
2022-04-22 15:44:07 +00:00
// Create the compacted tree.
ctree = NewHandle(offset);
if (ctree == NULL) {
DisposeHandle((Handle)infos);
return kErrorNoMemory;
}
for (i = 0; i < nodecount; i++) {
node = *nodes + i;
info = *infos + i;
min = info->min;
max = info->max;
offset = info->offset;
cnode = (void *)(*ctree + offset);
cnode->base = min;
cnode->span = max - min;
centry = (void *)(*ctree + offset + sizeof(struct CNode));
for (j = min; j <= max; j++) {
centry->output = node->entries[j].output;
next = node->entries[j].next;
if (next != 0) {
next = (*infos)[next].offset;
}
centry->next = next;
centry++;
}
}
DisposeHandle((Handle)infos);
*out = ctree;
return 0;
}
ErrorCode Convert1rBuild(Handle *out, Handle data, Size datasz)
{
struct TTree table;
ErrorCode err;
err = CreateTree(&table, data, datasz);
if (err != 0) {
return err;
}
err = CompactTree(out, table.nodes, table.count);
DisposeHandle((Handle)table.nodes);
return err;
}
struct Convert1rState {
UInt8 lastch;
UInt8 output;
UInt16 tableoffset;
};
void Convert1rRun(const void *cvtptr, LineBreakConversion lc,
struct ConverterState *stateptr, UInt8 **optr, UInt8 *oend,
const UInt8 **iptr, const UInt8 *iend)
{
struct Convert1rState *state = (struct Convert1rState *)stateptr;
const struct CNode *node;
const struct CEntry *entry;
UInt8 *opos = *optr;
const UInt8 *ipos = *iptr, *savein;
unsigned ch, lastch, chlen, output, saveout, toffset, savetoffset;
ch = state->lastch;
savein = ipos;
saveout = state->output;
toffset = state->tableoffset;
savetoffset = toffset;
if (oend - opos < 2) {
goto done;
}
goto resume;
next_out:
if (oend - opos < 2) {
goto done;
}
2022-04-22 15:44:07 +00:00
// Follow state machine to the end.
savein = ipos;
saveout = 0;
toffset = 0;
savetoffset = 0;
resume:
for (;;) {
if (ipos >= iend) {
goto done;
}
lastch = ch;
ch = *ipos++;
node = (const void *)((const UInt8 *)cvtptr + toffset);
ch -= node->base;
if (ch > node->span) {
toffset = 0;
goto bad_char;
}
entry =
(const void *)((const UInt8 *)cvtptr + toffset +
sizeof(struct CNode) + ch * sizeof(struct CEntry));
output = entry->output;
toffset = entry->next;
if (toffset == 0) {
2022-04-22 15:44:07 +00:00
// Reached end of tree.
if (output == 0) {
goto bad_char;
}
*opos++ = output;
goto next_out;
}
if (output != 0) {
2022-04-22 15:44:07 +00:00
// Can produce output here, or can consume more input. We try
// consuming more input, but save the state to rewind if that fails.
savein = ipos;
saveout = output;
savetoffset = toffset;
}
}
bad_char:
2022-04-22 15:44:07 +00:00
// Bad character. Back up and try again.
ipos = savein;
if (saveout != 0) {
2022-04-22 15:44:07 +00:00
// Produce saved output.
*opos++ = saveout;
ch = 0;
} else {
2022-04-22 15:44:07 +00:00
// No saved output, this really is a bad character. Consume one UTF-8
// character, emit it as a fallback, and continue.
ch = *ipos++;
if ((ch & 0x80) == 0) {
2022-04-22 15:44:07 +00:00
// ASCII character: either NUL, CR, or LF, because only these
// characters will result in a transition to state 0.
if (ch == 0) {
*opos++ = ch;
} else if (ch == kCharLF && lastch == kCharCR) {
if (lc == kLineBreakKeep) {
*opos++ = ch;
}
} else {
switch (lc) {
case kLineBreakKeep:
*opos++ = ch;
break;
case kLineBreakLF:
*opos++ = kCharLF;
break;
case kLineBreakCR:
*opos++ = kCharCR;
break;
case kLineBreakCRLF:
*opos++ = kCharCR;
*opos++ = kCharLF;
break;
}
}
} else {
if ((ch & 0xe0) == 0xc0) {
chlen = 1;
} else if ((ch & 0xf0) == 0xe0) {
chlen = 2;
} else if ((ch & 0xf8) == 0xf0) {
chlen = 3;
} else {
chlen = 0;
}
for (; chlen > 0; chlen--) {
if (ipos == iend) {
goto done;
}
ch = *ipos;
if ((ch & 0xc0) != 0x80) {
break;
}
ipos++;
}
*opos++ = kCharSubstitute;
}
}
goto next_out;
done:
state->lastch = ch;
state->output = saveout;
state->tableoffset = savetoffset;
*optr = opos;
*iptr = savein;
}