mirror of
https://github.com/depp/syncfiles.git
synced 2025-01-18 10:30:20 +00:00
Implement extended ASCII converter engine
This adds support for the simplest 8-bit character encodings, which are compatible with ASCII.
This commit is contained in:
parent
612aad382f
commit
c96bb9cd0a
12
.vscode/c_cpp_properties.json
vendored
Normal file
12
.vscode/c_cpp_properties.json
vendored
Normal file
@ -0,0 +1,12 @@
|
||||
{
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Linux",
|
||||
"includePath": ["${workspaceFolder}"],
|
||||
"cStandard": "c89",
|
||||
"intelliSenseMode": "linux-clang-x64",
|
||||
"compilerPath": "/usr/bin/clang"
|
||||
}
|
||||
],
|
||||
"version": 4
|
||||
}
|
@ -4,9 +4,11 @@ go_binary(
|
||||
name = "macscript",
|
||||
srcs = [
|
||||
"data.go",
|
||||
"filenames.go",
|
||||
"main.go",
|
||||
"rez.go",
|
||||
"scriptmap.go",
|
||||
"source.go",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
|
26
gen/filenames.go
Normal file
26
gen/filenames.go
Normal file
@ -0,0 +1,26 @@
|
||||
package main
|
||||
|
||||
import "strconv"
|
||||
|
||||
func writeFilenames(charmaps []string, filename string) error {
|
||||
s, err := createCSource(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
w := s.writer
|
||||
w.WriteString(header)
|
||||
w.WriteString(
|
||||
"#include \"src/test.h\"\n" +
|
||||
"const char *const kCharsetFilename[] = {\n")
|
||||
for _, fn := range charmaps {
|
||||
if fn != "" {
|
||||
w.WriteByte('\t')
|
||||
w.WriteString(strconv.Quote(fn))
|
||||
w.WriteString(",\n")
|
||||
}
|
||||
}
|
||||
w.WriteString("\tNULL\n};\n")
|
||||
|
||||
return s.flush()
|
||||
}
|
@ -19,6 +19,7 @@ var (
|
||||
flagDest string
|
||||
flagSrc string
|
||||
flagQuiet bool
|
||||
flagFormat bool
|
||||
)
|
||||
|
||||
func getSrcdir() (string, error) {
|
||||
@ -90,6 +91,9 @@ func mainE() error {
|
||||
if err := writeMap(&d, m, filepath.Join(destdir, "charmap.c")); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeFilenames(cms, filepath.Join(destdir, "charmap_name.c")); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeRez(&d, cms, filepath.Join(destdir, "charmap.r")); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -100,6 +104,7 @@ func main() {
|
||||
flag.StringVar(&flagDest, "dest", "", "output directory")
|
||||
flag.StringVar(&flagSrc, "src", "", "source directory")
|
||||
flag.BoolVar(&flagQuiet, "quiet", false, "only output error messages")
|
||||
flag.BoolVar(&flagFormat, "format", true, "run clang-format on C output")
|
||||
flag.Parse()
|
||||
if args := flag.Args(); len(args) != 0 {
|
||||
fmt.Fprintf(os.Stderr, "Error: unexpected argument: %q\n", args[0])
|
||||
|
@ -1,10 +1,7 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
)
|
||||
|
||||
@ -73,17 +70,13 @@ func genMap(d *scriptdata) []*scriptmap {
|
||||
// writeMap writes out a C function that returns the correct character map for a
|
||||
// given script and region.
|
||||
func writeMap(d *scriptdata, m []*scriptmap, filename string) error {
|
||||
if !flagQuiet {
|
||||
fmt.Fprintln(os.Stderr, "Writing:", filename)
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename)
|
||||
s, err := createCSource(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer fp.Close()
|
||||
w := bufio.NewWriter(fp)
|
||||
defer s.close()
|
||||
|
||||
w := s.writer
|
||||
w.WriteString(header)
|
||||
w.WriteString(
|
||||
"#include \"src/convert.h\"\n" +
|
||||
@ -120,17 +113,5 @@ func writeMap(d *scriptdata, m []*scriptmap, filename string) error {
|
||||
"}\n" +
|
||||
"}\n")
|
||||
|
||||
if err := w.Flush(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := fp.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cmd := exec.Command("clang-format", "-i", filename)
|
||||
if err := cmd.Run(); err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Warning: clang-format failed:", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
return s.flush()
|
||||
}
|
||||
|
64
gen/source.go
Normal file
64
gen/source.go
Normal file
@ -0,0 +1,64 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
// csource is a C source file which is being generated.
type csource struct {
	// filename is the path of the output file. flush clears it on success.
	filename string
	// file is the open output file, or nil once it has been closed.
	file *os.File
	// writer is a buffered writer for file.
	writer *bufio.Writer
}
|
||||
|
||||
func createCSource(filename string) (s csource, err error) {
|
||||
if !flagQuiet {
|
||||
fmt.Fprintln(os.Stderr, "Writing:", filename)
|
||||
}
|
||||
|
||||
fp, err := os.Create(filename)
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
return csource{
|
||||
filename: filename,
|
||||
file: fp,
|
||||
writer: bufio.NewWriter(fp),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *csource) close() {
|
||||
if s.file != nil {
|
||||
s.file.Close()
|
||||
s.file = nil
|
||||
}
|
||||
if s.filename != "" {
|
||||
os.Remove(s.filename)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *csource) flush() error {
|
||||
if s.file == nil {
|
||||
panic("already closed")
|
||||
}
|
||||
err := s.writer.Flush()
|
||||
s.writer = nil
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = s.file.Close()
|
||||
s.file = nil
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if flagFormat {
|
||||
cmd := exec.Command("clang-format", "-i", s.filename)
|
||||
if err := cmd.Run(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
s.filename = ""
|
||||
return nil
|
||||
}
|
@ -1,15 +1,7 @@
|
||||
load("@rules_cc//cc:defs.bzl", "cc_library")
|
||||
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
|
||||
load("//bazel:copts.bzl", "COPTS")
|
||||
|
||||
genrule(
|
||||
name = "data",
|
||||
srcs = [
|
||||
"//charmap:data",
|
||||
"//scripts:data",
|
||||
],
|
||||
outs = [
|
||||
"charmap.c",
|
||||
"charmap.r",
|
||||
_data = [
|
||||
"charmap_roman.dat",
|
||||
"charmap_turkish.dat",
|
||||
"charmap_croatian.dat",
|
||||
@ -21,7 +13,19 @@ genrule(
|
||||
"charmap_cyrillic.dat",
|
||||
"charmap_inuit.dat",
|
||||
"charmap_centeuro.dat",
|
||||
]
|
||||
|
||||
genrule(
|
||||
name = "data",
|
||||
srcs = [
|
||||
"//charmap:data",
|
||||
"//scripts:data",
|
||||
],
|
||||
outs = [
|
||||
"charmap.c",
|
||||
"charmap.r",
|
||||
"charmap_name.c",
|
||||
] + _data,
|
||||
cmd = "$(execpath //gen:macscript) -dest=$(RULEDIR) -src=. -quiet",
|
||||
tools = [
|
||||
"//gen:macscript",
|
||||
@ -32,9 +36,26 @@ cc_library(
|
||||
name = "convert",
|
||||
srcs = [
|
||||
"charmap.c",
|
||||
"convert.c",
|
||||
"convert.h",
|
||||
"convert_1f.c",
|
||||
"convert_1r.c",
|
||||
"defs.h",
|
||||
"test.h",
|
||||
"toolbox.c",
|
||||
],
|
||||
copts = COPTS,
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "convert_test",
|
||||
srcs = [
|
||||
"charmap_name.c",
|
||||
"convert_test.c",
|
||||
],
|
||||
copts = COPTS,
|
||||
data = _data,
|
||||
deps = [
|
||||
":convert",
|
||||
],
|
||||
)
|
||||
|
10
src/README.md
Normal file
10
src/README.md
Normal file
@ -0,0 +1,10 @@
|
||||
# Converter
|
||||
|
||||
## Debugging
|
||||
|
||||
Tests can be debugged with GDB:
|
||||
|
||||
```shell
|
||||
bazel build -c dbg //src:convert_test
|
||||
gdb -ex 'dir .' -ex 'cd bazel-bin' bazel-bin/src/convert_test
|
||||
```
|
38
src/convert.c
Normal file
38
src/convert.c
Normal file
@ -0,0 +1,38 @@
|
||||
#include "src/convert.h"
|
||||
|
||||
struct ConvertEngine {
|
||||
ConvertBuildf build;
|
||||
ConvertRunf run;
|
||||
};
|
||||
|
||||
const struct ConvertEngine kEngines[][2] = {
|
||||
{{Convert1fBuild, Convert1fRun}, {Convert1rBuild, Convert1rRun}}};
|
||||
|
||||
int ConverterBuild(struct Converter *c, Handle data, Size datasz,
|
||||
ConvertDirection direction, OSErr *errp)
|
||||
{
|
||||
int engine, r;
|
||||
const struct ConvertEngine *funcs;
|
||||
Handle out;
|
||||
|
||||
if (datasz == 0) {
|
||||
return kErrorBadData;
|
||||
}
|
||||
engine = (UInt8) * *data - 1;
|
||||
if (engine < 0 || (int)(sizeof(kEngines) / sizeof(*kEngines)) <= engine) {
|
||||
/* Invalid engine. */
|
||||
return kErrorBadData;
|
||||
}
|
||||
funcs = &kEngines[engine][direction];
|
||||
if (funcs->build == NULL || funcs->run == NULL) {
|
||||
/* Invalid engine. */
|
||||
return kErrorBadData;
|
||||
}
|
||||
r = funcs->build(&out, data, datasz, errp);
|
||||
if (r != 0) {
|
||||
return r;
|
||||
}
|
||||
c->data = out;
|
||||
c->run = funcs->run;
|
||||
return 0;
|
||||
}
|
@ -2,8 +2,94 @@
|
||||
#define convert_h
|
||||
/* convert.h - character set conversion routines. */
|
||||
|
||||
#include "src/defs.h"
|
||||
|
||||
/* Error codes. */
|
||||
enum
|
||||
{
|
||||
/* No error. */
|
||||
kErrorOK,
|
||||
|
||||
/* Memory allocation failed. */
|
||||
kErrorNoMemory,
|
||||
|
||||
/* Invalid table data. */
|
||||
kErrorBadData
|
||||
};
|
||||
|
||||
enum
|
||||
{
|
||||
/* Constants for CR and LF. Note that we should not use '\n' or '\r'
|
||||
anywhere, because these character constants may have unexpected values on
|
||||
certain old Mac OS compilers, depending on the compiler settings. In
|
||||
particular, the values of '\n' and '\r' will be swapped. */
|
||||
kCharLF = 10,
|
||||
kCharCR = 13,
|
||||
|
||||
/* Constant for substitution character: '?'. */
|
||||
kCharSubstitute = 63
|
||||
};
|
||||
|
||||
typedef enum
|
||||
{
|
||||
/* Don't translate line breaks. */
|
||||
kLineBreakKeep,
|
||||
|
||||
/* Convert line breaks to LF. */
|
||||
kLineBreakLF,
|
||||
|
||||
/* Convert line breaks to CR. */
|
||||
kLineBreakCR,
|
||||
|
||||
/* Convert line breaks to CR LF. */
|
||||
kLineBreakCRLF
|
||||
} LineBreakConversion;
|
||||
|
||||
/* Directions that the converter runs in. */
|
||||
typedef enum
|
||||
{
|
||||
kToUTF8,
|
||||
kFromUTF8
|
||||
} ConvertDirection;
|
||||
|
||||
/* Get the character map used for the given Mac OS script and region codes.
|
||||
Return -1 if no known character map exists. */
|
||||
int GetCharmap(int script, int region);
|
||||
|
||||
/* The state of a converter. Must be zeroed prior to first conversion. */
|
||||
struct ConverterState {
|
||||
UInt32 data;
|
||||
};
|
||||
|
||||
/* Implementation function for building a converter. */
|
||||
typedef int (*ConvertBuildf)(Handle *out, Handle data, Size datasz,
|
||||
OSErr *errp);
|
||||
|
||||
/* Implementation function for running a converter. */
|
||||
typedef void (*ConvertRunf)(const void *cvtptr, LineBreakConversion lc,
|
||||
struct ConverterState *stateptr, UInt8 **optr,
|
||||
UInt8 *oend, const UInt8 **iptr, const UInt8 *iend);
|
||||
|
||||
/* A converter. The converter can be freed by disposing the handle. */
|
||||
struct Converter {
|
||||
Handle data;
|
||||
ConvertRunf run;
|
||||
};
|
||||
|
||||
/* Build a converter from the given conversion table data. */
|
||||
int ConverterBuild(struct Converter *c, Handle data, Size datasz,
|
||||
ConvertDirection direction, OSErr *errp);
|
||||
|
||||
/* Engine 1: extended ASCII */
|
||||
|
||||
int Convert1fBuild(Handle *out, Handle data, Size datasz, OSErr *errp);
|
||||
void Convert1fRun(const void *cvtptr, LineBreakConversion lc,
|
||||
struct ConverterState *stateptr, UInt8 **optr, UInt8 *oend,
|
||||
const UInt8 **iptr, const UInt8 *iend);
|
||||
|
||||
int Convert1rBuild(Handle *out, Handle data, Size datasz, OSErr *errp);
|
||||
void Convert1rRun(const void *cvtptr, LineBreakConversion lc,
|
||||
struct ConverterState *stateptr, UInt8 **optr, UInt8 *oend,
|
||||
const UInt8 **iptr, const UInt8 *iend);
|
||||
|
||||
#endif
|
||||
|
127
src/convert_1f.c
Normal file
127
src/convert_1f.c
Normal file
@ -0,0 +1,127 @@
|
||||
/* convert_1f.c - Forward conversion from extended ASCII to UTF-8. */
|
||||
#include "src/convert.h"
|
||||
#include "src/defs.h"
|
||||
|
||||
struct Convert1fData {
|
||||
/* Unicode characters, encoded in UTF-8, and packed MSB first. Always either
|
||||
2 bytes or 3 bytes. */
|
||||
UInt32 chars[128];
|
||||
};
|
||||
|
||||
struct Convert1fState {
|
||||
UInt8 lastch;
|
||||
};
|
||||
|
||||
/* Build the forward (extended ASCII to UTF-8) converter from table data.

   The table data is datasz bytes long, including the leading engine byte. For
   each of the 128 high characters, it contains two length-prefixed strings:
   the UTF-8 encoding used for output, which must be 2 or 3 bytes long, and a
   second encoding, which is skipped here.

   On success, stores a new handle containing a struct Convert1fData in *out
   and returns 0. Returns kErrorNoMemory, with the Memory Manager error in
   *errp, if allocation fails, or kErrorBadData if the table is malformed. */
int Convert1fBuild(Handle *out, Handle data, Size datasz, OSErr *errp)
{
	Handle h;
	struct Convert1fData *cvt;
	int i, n;
	UInt32 uch;
	const UInt8 *dptr, *dend;

	if (datasz < 1) {
		return kErrorBadData;
	}
	h = NewHandle(sizeof(struct Convert1fData));
	if (h == NULL) {
		*errp = MemError();
		return kErrorNoMemory;
	}
	cvt = (void *)*h;
	dptr = (void *)*data;
	/* The end must be calculated from the start of the data, before skipping
	   the engine byte, because datasz includes the engine byte. */
	dend = dptr + datasz;
	dptr++;
	for (i = 0; i < 128; i++) {
		/* Primary encoding: packed into the table. */
		if (dptr == dend) {
			goto bad_table;
		}
		n = *dptr++;
		if (n < 2 || 3 < n) {
			goto bad_table;
		}
		if (dend - dptr < n) {
			goto bad_table;
		}
		uch = 0;
		while (n-- > 0) {
			uch = (uch << 8) | *dptr++;
		}
		cvt->chars[i] = uch;
		/* Second encoding: not used in this direction. */
		if (dptr == dend) {
			goto bad_table;
		}
		n = *dptr++;
		if (dend - dptr < n) {
			goto bad_table;
		}
		dptr += n;
	}
	*out = h;
	return 0;

bad_table:
	DisposeHandle(h);
	return kErrorBadData;
}
|
||||
|
||||
/* Convert extended ASCII text to UTF-8.

   Input is consumed from *iptr up to iend, and output is written at *optr up
   to oend. Conversion stops when the input is exhausted or when fewer than
   three bytes of output space remain. Both pointers are advanced past the data
   processed. The last byte consumed is kept in the state so that a CR LF pair
   which is split across two calls is still recognized. */
void Convert1fRun(const void *cvtptr, LineBreakConversion lc,
                  struct ConverterState *stateptr, UInt8 **optr, UInt8 *oend,
                  const UInt8 **iptr, const UInt8 *iend)
{
	const struct Convert1fData *table = cvtptr;
	struct Convert1fState *st = (struct Convert1fState *)stateptr;
	UInt8 *out = *optr;
	const UInt8 *in = *iptr;
	unsigned cur, prev;
	UInt32 enc;

	cur = st->lastch;
	while (in < iend && oend - out >= 3) {
		prev = cur;
		cur = *in++;

		if (cur >= 128) {
			/* High character: emit the packed UTF-8 encoding, which is three
			   bytes long if it does not fit in 16 bits, otherwise two. */
			enc = table->chars[cur - 128];
			if (enc > 0xffff) {
				*out++ = enc >> 16;
			}
			*out++ = enc >> 8;
			*out++ = enc;
			continue;
		}

		if (cur != kCharLF && cur != kCharCR) {
			/* Ordinary ASCII character. */
			*out++ = cur;
			continue;
		}

		if (cur == kCharLF && prev == kCharCR) {
			/* Second half of CR LF. The CR already produced the converted line
			   break, so the LF is only copied if line breaks are kept. */
			if (lc == kLineBreakKeep) {
				*out++ = cur;
			}
			continue;
		}

		/* Start of a line break. */
		switch (lc) {
		case kLineBreakKeep:
			*out++ = cur;
			break;
		case kLineBreakLF:
			*out++ = kCharLF;
			break;
		case kLineBreakCR:
			*out++ = kCharCR;
			break;
		case kLineBreakCRLF:
			*out++ = kCharCR;
			*out++ = kCharLF;
			break;
		}
	}
	st->lastch = cur;

	*optr = out;
	*iptr = in;
}
|
368
src/convert_1r.c
Normal file
368
src/convert_1r.c
Normal file
@ -0,0 +1,368 @@
|
||||
/* convert_1r.c - Reverse conversion from UTF-8 to extended ASCII. */
|
||||
#include "src/convert.h"
|
||||
#include "src/defs.h"
|
||||
|
||||
enum
{
	/* Longest encoding of a single character, in bytes, which is accepted in
	   the table data. */
	kMaxEncodedLength = 8,

	/* Number of tree nodes to allocate space for initially. */
	kInitialTableAlloc = 8
};
|
||||
|
||||
struct TEntry {
|
||||
/* The output character, or zero if no output. */
|
||||
UInt8 output;
|
||||
/* The next node, or zero if no next node. */
|
||||
UInt8 next;
|
||||
};
|
||||
|
||||
/* A node for building the converter. */
|
||||
struct TNode {
|
||||
struct TEntry entries[256];
|
||||
};
|
||||
|
||||
struct TTree {
|
||||
struct TNode **nodes;
|
||||
int count;
|
||||
};
|
||||
|
||||
/* Parse the table data and create the uncompressed tree which maps UTF-8 byte
   sequences to extended ASCII characters.

   The table data is datasz bytes long, and the first byte (the engine number)
   is skipped. For each of the 128 high characters, the data contains two
   length-prefixed encodings. An encoding with length zero is absent.

   On success, stores the tree in *tree and returns 0; the caller must dispose
   tree->nodes. Returns kErrorNoMemory, with the Memory Manager error in *errp,
   if allocation fails, or kErrorBadData if the table is malformed. */
static int CreateTree(struct TTree *tree, Handle data, Size datasz, OSErr *errp)
{
	struct TNode **nodes, *node;
	int i, j, dpos, enclen, encend, state, cur, nodecount, nodealloc;
	unsigned ch;
	OSErr err;

	/* Create a tree with a root node mapping all the ASCII characters except
	   NUL, CR, and LF. NUL won't map because an output of 0 is interpreted as
	   no output. CR and LF are removed so they can be handled specially by the
	   decoder. */
	nodes =
		(struct TNode **)NewHandle(kInitialTableAlloc * sizeof(struct TNode));
	if (nodes == NULL) {
		/* Return directly: there is no handle to dispose. */
		*errp = MemError();
		return kErrorNoMemory;
	}
	nodecount = 1;
	nodealloc = kInitialTableAlloc;
	node = *nodes;
	MemClear(node, sizeof(struct TNode));
	for (i = 0; i < 128; i++) {
		node->entries[i].output = i;
	}
	node->entries[kCharLF].output = 0;
	node->entries[kCharCR].output = 0;

	/* Parse the table data and build up a tree of TNode. */
	dpos = 1;
	/* For each high character (128..255). */
	for (i = 0; i < 128; i++) {
		/* For each encoding of that character. */
		for (j = 0; j < 2; j++) {
			if (dpos >= datasz) {
				goto bad_table;
			}
			enclen = (UInt8)(*data)[dpos++];
			if (enclen != 0) {
				if (enclen < 2 || enclen > datasz - dpos ||
				    enclen > kMaxEncodedLength) {
					goto bad_table;
				}
				/* Iterate over all but last byte in encoding, to find the node
				   which will produce the decoded byte as output. */
				state = 0;
				node = *nodes;
				for (encend = dpos + enclen - 1; dpos < encend; dpos++) {
					ch = (UInt8)(*data)[dpos];
					cur = state;
					state = node->entries[ch].next;
					if (state == 0) {
						/* Need a new node. Node indexes are stored in a UInt8,
						   so there can be no more than 256 nodes. */
						if (nodecount > 255) {
							goto bad_table;
						}
						if (nodecount >= nodealloc) {
							nodealloc *= 2;
							SetHandleSize((Handle)nodes,
							              nodealloc * sizeof(struct TNode));
							err = MemError();
							if (err != 0) {
								goto have_error;
							}
							/* The block may have moved. */
							node = *nodes + cur;
						}
						state = nodecount++;
						node->entries[ch].next = state;
						node = (*nodes) + state;
						MemClear(node, sizeof(*node));
					} else {
						node = *nodes + state;
					}
				}
				ch = (UInt8)(*data)[dpos++];
				if (node->entries[ch].output != 0) {
					/* This byte sequence already maps to a character. */
					goto bad_table;
				}
				node->entries[ch].output = i | 0x80;
			}
		}
	}
	/* Release the unused space at the end. */
	SetHandleSize((Handle)nodes, nodecount * sizeof(struct TNode));
	tree->nodes = nodes;
	tree->count = nodecount;
	return 0;

bad_table:
	DisposeHandle((Handle)nodes);
	return kErrorBadData;

have_error:
	DisposeHandle((Handle)nodes);
	*errp = err;
	return kErrorNoMemory;
}
|
||||
|
||||
struct NodeInfo {
|
||||
UInt8 min;
|
||||
UInt8 max;
|
||||
UInt16 offset;
|
||||
};
|
||||
|
||||
struct CEntry {
|
||||
UInt16 output;
|
||||
UInt16 next;
|
||||
};
|
||||
|
||||
/* A compressed table node. Followed by an array of centry. */
|
||||
struct CNode {
|
||||
/* First byte in table. */
|
||||
UInt8 base;
|
||||
/* Number of entries in table, minus one. */
|
||||
UInt8 span;
|
||||
};
|
||||
|
||||
static int CompactTree(Handle *out, struct TNode **nodes, int nodecount,
|
||||
OSErr *errp)
|
||||
{
|
||||
Handle ctree;
|
||||
struct TNode *node;
|
||||
struct NodeInfo **infos, *info;
|
||||
struct CNode *cnode;
|
||||
struct CEntry *centry;
|
||||
int i, j, min, max, count, next;
|
||||
unsigned offset;
|
||||
|
||||
/* Figure out where each compacted node will go. */
|
||||
infos = (struct NodeInfo **)NewHandle(sizeof(struct NodeInfo) * nodecount);
|
||||
if (infos == NULL) {
|
||||
*errp = MemError();
|
||||
return kErrorNoMemory;
|
||||
}
|
||||
offset = 0;
|
||||
for (i = 0; i < nodecount; i++) {
|
||||
node = *nodes + i;
|
||||
min = 0;
|
||||
while (node->entries[min].output == 0 && node->entries[min].next == 0) {
|
||||
min++;
|
||||
}
|
||||
max = 255;
|
||||
while (node->entries[max].output == 0 && node->entries[max].next == 0) {
|
||||
max--;
|
||||
}
|
||||
info = *infos + i;
|
||||
info->min = min;
|
||||
info->max = max;
|
||||
info->offset = offset;
|
||||
count = max - min + 1;
|
||||
offset += sizeof(struct CNode) + count * sizeof(struct CEntry);
|
||||
}
|
||||
|
||||
/* Create the compacted tree. */
|
||||
ctree = NewHandle(offset);
|
||||
if (ctree == NULL) {
|
||||
*errp = MemError();
|
||||
DisposeHandle((Handle)infos);
|
||||
return kErrorNoMemory;
|
||||
}
|
||||
for (i = 0; i < nodecount; i++) {
|
||||
node = *nodes + i;
|
||||
info = *infos + i;
|
||||
min = info->min;
|
||||
max = info->max;
|
||||
offset = info->offset;
|
||||
cnode = (void *)(*ctree + offset);
|
||||
cnode->base = min;
|
||||
cnode->span = max - min;
|
||||
centry = (void *)(*ctree + offset + sizeof(struct CNode));
|
||||
for (j = min; j <= max; j++) {
|
||||
centry->output = node->entries[j].output;
|
||||
next = node->entries[j].next;
|
||||
if (next != 0) {
|
||||
next = (*infos)[next].offset;
|
||||
}
|
||||
centry->next = next;
|
||||
centry++;
|
||||
}
|
||||
}
|
||||
|
||||
DisposeHandle((Handle)infos);
|
||||
*out = ctree;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Build the reverse (UTF-8 to extended ASCII) converter from table data. The
   table is parsed into an uncompressed tree, the tree is compacted into *out,
   and the uncompressed tree is then disposed. Returns 0 on success, or the
   error code from the step that failed. */
int Convert1rBuild(Handle *out, Handle data, Size datasz, OSErr *errp)
{
	struct TTree tree;
	int result;

	result = CreateTree(&tree, data, datasz, errp);
	if (result == 0) {
		result = CompactTree(out, tree.nodes, tree.count, errp);
		DisposeHandle((Handle)tree.nodes);
	}
	return result;
}
|
||||
|
||||
struct Convert1rState {
|
||||
UInt8 lastch;
|
||||
UInt8 output;
|
||||
UInt16 tableoffset;
|
||||
};
|
||||
|
||||
/* Convert UTF-8 text to extended ASCII by following the compacted tree created
   by Convert1rBuild.

   Input is consumed from *iptr up to iend, and output is written at *optr up
   to oend. Conversion stops when the input is exhausted or when fewer than two
   bytes of output space remain. On return, *iptr points past the last input
   which has been completely processed; input read after that point is read
   again by the next call. Byte sequences with no mapping are replaced with
   kCharSubstitute, one per UTF-8 character. NUL is copied, and CR and LF are
   converted according to lc. */
void Convert1rRun(const void *cvtptr, LineBreakConversion lc,
                  struct ConverterState *stateptr, UInt8 **optr, UInt8 *oend,
                  const UInt8 **iptr, const UInt8 *iend)
{
	struct Convert1rState *state = (struct Convert1rState *)stateptr;
	const struct CNode *node;
	const struct CEntry *entry;
	UInt8 *opos = *optr;
	const UInt8 *ipos = *iptr, *savein;
	unsigned ch, lastch, idx, chlen, output, saveout, toffset, savetoffset;

	ch = state->lastch;
	savein = ipos;
	saveout = state->output;
	toffset = state->tableoffset;
	savetoffset = toffset;
	if (oend - opos < 2) {
		goto done;
	}
	goto resume;

next_out:
	if (oend - opos < 2) {
		goto done;
	}

	/* Follow state machine to the end. */
	savein = ipos;
	saveout = 0;
	toffset = 0;
	savetoffset = 0;
resume:
	for (;;) {
		if (ipos >= iend) {
			goto done;
		}
		lastch = ch;
		ch = *ipos++;

		node = (const void *)((const UInt8 *)cvtptr + toffset);
		/* Keep the raw byte in ch, and use a separate variable for the index
		   into the node. The raw byte becomes lastch, which is compared
		   against CR below. If the index were stored in ch, then an LF
		   following any byte with index 13 would be mistaken for the second
		   half of CR LF. */
		idx = ch - node->base;
		if (idx > node->span) {
			toffset = 0;
			goto bad_char;
		}
		entry =
			(const void *)((const UInt8 *)cvtptr + toffset +
		                   sizeof(struct CNode) + idx * sizeof(struct CEntry));
		output = entry->output;
		toffset = entry->next;
		if (toffset == 0) {
			/* Reached end of tree. */
			if (output == 0) {
				goto bad_char;
			}
			*opos++ = output;
			goto next_out;
		}
		if (output != 0) {
			/* Can produce output here, or can consume more input. We try
			   consuming more input, but save the state to rewind if that
			   fails. */
			savein = ipos;
			saveout = output;
			savetoffset = toffset;
		}
	}

bad_char:
	/* Bad character. Back up and try again. */
	ipos = savein;
	if (saveout != 0) {
		/* Produce saved output. */
		*opos++ = saveout;
		ch = 0;
	} else {
		/* No saved output, this really is a bad character. Consume one
		   UTF-8 character, emit it as a fallback, and continue. */
		ch = *ipos++;
		if ((ch & 0x80) == 0) {
			/* ASCII character: either NUL, CR, or LF, because only these
			   characters will result in a transition to state 0. */
			if (ch == 0) {
				*opos++ = ch;
			} else if (ch == kCharLF && lastch == kCharCR) {
				/* Second half of CR LF. The CR already produced the
				   converted line break. */
				if (lc == kLineBreakKeep) {
					*opos++ = ch;
				}
			} else {
				switch (lc) {
				case kLineBreakKeep:
					*opos++ = ch;
					break;
				case kLineBreakLF:
					*opos++ = kCharLF;
					break;
				case kLineBreakCR:
					*opos++ = kCharCR;
					break;
				case kLineBreakCRLF:
					*opos++ = kCharCR;
					*opos++ = kCharLF;
					break;
				}
			}
		} else {
			/* Number of continuation bytes expected after this byte. */
			if ((ch & 0xe0) == 0xc0) {
				chlen = 1;
			} else if ((ch & 0xf0) == 0xe0) {
				chlen = 2;
			} else if ((ch & 0xf8) == 0xf0) {
				chlen = 3;
			} else {
				chlen = 0;
			}
			/* Skip the continuation bytes, stopping early at any byte which
			   is not a continuation byte. */
			for (; chlen > 0; chlen--) {
				if (ipos == iend) {
					/* Character is incomplete. Process it again when more
					   input is available. */
					goto done;
				}
				ch = *ipos;
				if ((ch & 0xc0) != 0x80) {
					break;
				}
				ipos++;
			}
			*opos++ = kCharSubstitute;
		}
	}
	goto next_out;

done:
	state->lastch = ch;
	state->output = saveout;
	state->tableoffset = savetoffset;
	*optr = opos;
	*iptr = savein;
}
|
321
src/convert_test.c
Normal file
321
src/convert_test.c
Normal file
@ -0,0 +1,321 @@
|
||||
/* Converter test. */
|
||||
#define _XOPEN_SOURCE 500
|
||||
|
||||
#include "src/convert.h"
|
||||
#include "src/test.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
enum
|
||||
{
|
||||
kInitialBufSize = 4 * 1024,
|
||||
kConvertBufferSize = 1024
|
||||
};
|
||||
|
||||
static int gFailCount;
|
||||
static char gTestName[128];
|
||||
|
||||
static void Failf(const char *msg, ...) __attribute__((format(printf, 1, 2)));
|
||||
|
||||
static void Failf(const char *msg, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
gFailCount++;
|
||||
fputs("Error: ", stderr);
|
||||
fputs(gTestName, stderr);
|
||||
fputs(": ", stderr);
|
||||
va_start(ap, msg);
|
||||
vfprintf(stderr, msg, ap);
|
||||
va_end(ap);
|
||||
fputc('\n', stderr);
|
||||
if (gFailCount >= 10) {
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Names of the converter error codes, indexed by error code. */
static const char *const kErrorNames[] = {"ok", "no memory", "bad data"};

/* Return the name of a converter error code. Aborts the program if the code
   is not in the table. */
static const char *ErrorName(int err)
{
	const int count = (int)(sizeof(kErrorNames) / sizeof(kErrorNames[0]));

	if (!(0 <= err && err < count)) {
		Dief("bad error code: %d", err);
	}
	return kErrorNames[err];
}
|
||||
|
||||
static void StringPrintf(char *dest, size_t destsz, const char *fmt, ...)
	__attribute__((format(printf, 3, 4)));

/* Format a string into a buffer of size destsz. Aborts the program if
   formatting fails or the result does not fit. */
static void StringPrintf(char *dest, size_t destsz, const char *fmt, ...)
{
	va_list args;
	int written;

	va_start(args, fmt);
	written = vsnprintf(dest, destsz, fmt, args);
	va_end(args);

	if (written >= 0 && written < (int)destsz) {
		return;
	}
	Dief("snprintf: overflow");
}
|
||||
|
||||
/* Read a file in its entirety. */
|
||||
static void ReadFile(const char *filename, void **datap, size_t *sizep)
|
||||
{
|
||||
char fnbuf[128];
|
||||
FILE *fp = NULL;
|
||||
char *buf = NULL, *newbuf;
|
||||
size_t size, alloc, newalloc, amt;
|
||||
int err;
|
||||
|
||||
StringPrintf(fnbuf, sizeof(fnbuf), "src/%s", filename);
|
||||
|
||||
fp = fopen(fnbuf, "rb");
|
||||
if (fp == NULL) {
|
||||
err = errno;
|
||||
goto error;
|
||||
}
|
||||
buf = malloc(kInitialBufSize);
|
||||
if (buf == NULL) {
|
||||
err = errno;
|
||||
goto error;
|
||||
}
|
||||
size = 0;
|
||||
alloc = kInitialBufSize;
|
||||
for (;;) {
|
||||
if (size >= alloc) {
|
||||
newalloc = alloc * 2;
|
||||
newbuf = realloc(buf, newalloc);
|
||||
if (newbuf == NULL) {
|
||||
err = errno;
|
||||
goto error;
|
||||
}
|
||||
alloc = newalloc;
|
||||
buf = newbuf;
|
||||
}
|
||||
amt = fread(buf + size, 1, alloc - size, fp);
|
||||
if (amt == 0) {
|
||||
if (feof(fp)) {
|
||||
break;
|
||||
}
|
||||
err = errno;
|
||||
goto error;
|
||||
}
|
||||
size += amt;
|
||||
}
|
||||
fclose(fp);
|
||||
*datap = buf;
|
||||
*sizep = size;
|
||||
return;
|
||||
|
||||
error:
|
||||
if (fp != NULL) {
|
||||
fclose(fp);
|
||||
}
|
||||
if (buf != NULL) {
|
||||
free(buf);
|
||||
}
|
||||
DieErrorf(err, "read %s", filename);
|
||||
}
|
||||
|
||||
static UInt8 *gBuffer[3];
|
||||
|
||||
/* Print len bytes from buf to standard error as a double-quoted string.
   Printable ASCII characters are printed as-is, with a backslash before each
   backslash or double quote. All other bytes are printed as \xNN. */
static void PrintQuotedString(const UInt8 *buf, int len)
{
	int pos, byte;

	fputc('"', stderr);
	for (pos = 0; pos < len; pos++) {
		byte = buf[pos];
		if (byte < 32 || byte > 126) {
			fprintf(stderr, "\\x%02x", byte);
			continue;
		}
		if (byte == '\\' || byte == '"') {
			fputc('\\', stderr);
		}
		fputc(byte, stderr);
	}
	fputc('"', stderr);
}
|
||||
|
||||
static void Check(int len0, int len1, int len2)
|
||||
{
|
||||
int i, n, col, diffcol, c1, c2;
|
||||
|
||||
if (len0 == len2 && memcmp(gBuffer[0], gBuffer[2], len2) == 0) {
|
||||
return;
|
||||
}
|
||||
Failf("incorrect output");
|
||||
n = len0;
|
||||
if (n > len2) {
|
||||
n = len2;
|
||||
}
|
||||
diffcol = -1;
|
||||
col = 0;
|
||||
for (i = 0; i < n; i++) {
|
||||
c1 = gBuffer[0][i];
|
||||
c2 = gBuffer[2][i];
|
||||
if (c1 != c2) {
|
||||
diffcol = col;
|
||||
break;
|
||||
}
|
||||
if (32 <= c1 && c1 <= 126) {
|
||||
col++;
|
||||
if (c1 == '\\' || c1 == '"') {
|
||||
col++;
|
||||
}
|
||||
} else {
|
||||
col += 4;
|
||||
}
|
||||
}
|
||||
fputs("Input: ", stderr);
|
||||
PrintQuotedString(gBuffer[1], len1);
|
||||
fputc('\n', stderr);
|
||||
fputs("Expect: ", stderr);
|
||||
PrintQuotedString(gBuffer[0], len0);
|
||||
fputc('\n', stderr);
|
||||
fputs("Output: ", stderr);
|
||||
PrintQuotedString(gBuffer[2], len2);
|
||||
fputc('\n', stderr);
|
||||
if (diffcol >= 0) {
|
||||
for (i = 0; i < diffcol + 9; i++) {
|
||||
fputc(' ', stderr);
|
||||
}
|
||||
fputc('^', stderr);
|
||||
}
|
||||
fputc('\n', stderr);
|
||||
}
|
||||
|
||||
static void TestConverter(const char *filename)
|
||||
{
|
||||
void *data;
|
||||
size_t datasz;
|
||||
Ptr datap;
|
||||
Handle datah;
|
||||
struct Converter cf, cr;
|
||||
struct ConverterState st;
|
||||
int r, i, j, jmax, len0, len1, len2;
|
||||
OSErr err;
|
||||
UInt8 *ptr;
|
||||
const UInt8 *iptr, *iend;
|
||||
UInt8 *optr, *oend;
|
||||
|
||||
data = NULL;
|
||||
cf.data = NULL;
|
||||
cr.data = NULL;
|
||||
|
||||
StringPrintf(gTestName, sizeof(gTestName), "%s", filename);
|
||||
|
||||
/* Load the converter into memory and build the conversion table. */
|
||||
ReadFile(filename, &data, &datasz);
|
||||
datap = data;
|
||||
datah = &datap;
|
||||
r = ConverterBuild(&cf, datah, datasz, kToUTF8, &err);
|
||||
if (r != 0) {
|
||||
Failf("ConverterBuild: %s (to UTF-8): %s", filename, ErrorName(r));
|
||||
goto done;
|
||||
}
|
||||
r = ConverterBuild(&cr, datah, datasz, kFromUTF8, &err);
|
||||
if (r != 0) {
|
||||
Failf("ConverterBuild: %s (from UTF-8): %s", filename, ErrorName(r));
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Create sample data to convert: 0-255, followed by 0. */
|
||||
len0 = 257;
|
||||
ptr = gBuffer[0];
|
||||
for (i = 0; i < 256; i++) {
|
||||
ptr[i] = i;
|
||||
}
|
||||
ptr[256] = 0;
|
||||
|
||||
/* Convert sample data. */
|
||||
iptr = gBuffer[0];
|
||||
iend = iptr + 257;
|
||||
optr = gBuffer[1];
|
||||
oend = optr + kConvertBufferSize;
|
||||
st.data = 0;
|
||||
cf.run(*cf.data, kLineBreakKeep, &st, &optr, oend, &iptr, iend);
|
||||
if (iptr != iend) {
|
||||
Failf("some data failed to convert");
|
||||
goto done;
|
||||
}
|
||||
len1 = optr - gBuffer[1];
|
||||
|
||||
/* Convert back, in three calls. The middle call will be to a 1-4 byte slice
|
||||
in the middle. */
|
||||
for (i = 1; i < len1 - 2; i++) {
|
||||
jmax = len1 - i;
|
||||
if (jmax > 4) {
|
||||
jmax = 4;
|
||||
}
|
||||
for (j = 1; j <= jmax; j++) {
|
||||
StringPrintf(gTestName, sizeof(gTestName), "%s reverse i=%d j=%d",
|
||||
filename, i, j);
|
||||
st.data = 0;
|
||||
iptr = gBuffer[1];
|
||||
optr = gBuffer[2];
|
||||
oend = optr + kConvertBufferSize;
|
||||
iend = gBuffer[1] + i;
|
||||
cr.run(*cr.data, kLineBreakKeep, &st, &optr, oend, &iptr, iend);
|
||||
iend = gBuffer[1] + i + j;
|
||||
cr.run(*cr.data, kLineBreakKeep, &st, &optr, oend, &iptr, iend);
|
||||
iend = gBuffer[1] + len1;
|
||||
cr.run(*cr.data, kLineBreakKeep, &st, &optr, oend, &iptr, iend);
|
||||
if (iptr != iend) {
|
||||
Failf("some data failed to convert");
|
||||
continue;
|
||||
}
|
||||
len2 = optr - gBuffer[2];
|
||||
Check(len0, len1, len2);
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
free(data);
|
||||
if (cf.data != NULL) {
|
||||
DisposeHandle(cf.data);
|
||||
}
|
||||
if (cr.data != NULL) {
|
||||
DisposeHandle(cr.data);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
void *buf;
|
||||
const char *filename;
|
||||
int i;
|
||||
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
for (i = 0; i < 3; i++) {
|
||||
buf = malloc(kConvertBufferSize);
|
||||
if (buf == NULL) {
|
||||
DieErrorf(errno, "malloc");
|
||||
}
|
||||
gBuffer[i] = buf;
|
||||
}
|
||||
|
||||
for (i = 0;; i++) {
|
||||
filename = kCharsetFilename[i];
|
||||
if (filename == NULL) {
|
||||
break;
|
||||
}
|
||||
TestConverter(filename);
|
||||
}
|
||||
|
||||
for (i = 0; i < 3; i++) {
|
||||
free(gBuffer[i]);
|
||||
}
|
||||
|
||||
return gFailCount == 0 ? 0 : 1;
|
||||
}
|
17
src/test.h
Normal file
17
src/test.h
Normal file
@ -0,0 +1,17 @@
|
||||
#ifndef test_h
#define test_h
/* test.h - declarations shared by the unit test programs. */

#include "src/defs.h"

/* Names of the character set data files to test. The array ends with a NULL
   entry, which marks the end of the list. */
extern const char *const kCharsetFilename[];

/* Write "Error: " and a printf-style message to stderr, then exit with
   status 1. Does not return. */
void Dief(const char *msg, ...) __attribute__((noreturn, format(printf, 1, 2)));

/* Like Dief, but the message is followed by ": " and the strerror() text for
   errcode. Does not return. */
void DieErrorf(int errcode, const char *msg, ...)
	__attribute__((noreturn, format(printf, 2, 3)));

#endif
|
@ -3,17 +3,16 @@
|
||||
This is used to run conversion tests on non-Mac OS systems to make
|
||||
development easier. These are not intended to make it possible to port the
|
||||
converter to non-Mac OS systems. */
|
||||
#include "defs.h"
|
||||
#include "src/defs.h"
|
||||
#include "src/test.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
static void Dief(const char *msg, ...)
|
||||
__attribute__((noreturn, format(printf, 1, 2)));
|
||||
|
||||
static void Dief(const char *msg, ...) {
|
||||
void Dief(const char *msg, ...)
|
||||
{
|
||||
va_list ap;
|
||||
fputs("Error: ", stderr);
|
||||
va_start(ap, msg);
|
||||
@ -23,7 +22,21 @@ static void Dief(const char *msg, ...) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
Handle NewHandle(Size byteCount) {
|
||||
/* Report a fatal error and terminate the program. Writes "Error: ", the
   printf-style message built from msg and the variadic arguments, then ": "
   and the strerror() description of errcode, then a newline, all to stderr.
   Exits with status 1. */
void DieErrorf(int errcode, const char *msg, ...)
{
	va_list args;

	fputs("Error: ", stderr);
	va_start(args, msg);
	vfprintf(stderr, msg, args);
	va_end(args);
	fprintf(stderr, ": %s\n", strerror(errcode));
	exit(1);
}
|
||||
|
||||
Handle NewHandle(Size byteCount)
|
||||
{
|
||||
Ptr p;
|
||||
Handle h;
|
||||
|
||||
@ -42,22 +55,26 @@ Handle NewHandle(Size byteCount) {
|
||||
return h;
|
||||
}
|
||||
|
||||
void HLock(Handle h) {
|
||||
/* HLock replacement for the test build: a no-op. The handle is not used. */
void HLock(Handle h)
{
	(void)h;
}
|
||||
|
||||
void HUnlock(Handle h) {
|
||||
/* HUnlock replacement for the test build: a no-op. The handle is not used. */
void HUnlock(Handle h)
{
	(void)h;
}
|
||||
|
||||
void DisposeHandle(Handle h) {
|
||||
/* Release a handle: frees the block the handle points to, then the handle
   itself. A NULL handle is ignored. */
void DisposeHandle(Handle h)
{
	if (h == NULL) {
		return;
	}
	free(*h);
	free(h);
}
|
||||
|
||||
void SetHandleSize(Handle h, Size newSize) {
|
||||
void SetHandleSize(Handle h, Size newSize)
|
||||
{
|
||||
Ptr p;
|
||||
if (h == NULL) {
|
||||
Dief("SetHandleSize: h = NULL");
|
||||
@ -69,11 +86,13 @@ void SetHandleSize(Handle h, Size newSize) {
|
||||
*h = p;
|
||||
}
|
||||
|
||||
OSErr MemError(void) {
|
||||
/* Always returns 0. Memory allocation failures abort the program, so there
   is never an error to report here. */
OSErr MemError(void)
{
	return 0;
}
|
||||
|
||||
void MemClear(void *ptr, Size size) {
|
||||
/* Set size bytes starting at ptr to zero. */
void MemClear(void *ptr, Size size)
{
	(void)memset(ptr, 0, size);
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user