antoine-source/appleworksgs/Spell/Src/PHON.C
2023-03-04 03:45:20 +01:00

1 line
16 KiB
C
Executable File

/***********************************************************************\
Filename: phon.c
\***********************************************************************/
#include "environ.h"
#include "string.h"
/*#include "ctype.h"*/
/* This routine converts a string into the two-character representation of
the phonetically encoded string. The input string is matched against the
phonetic rules and the translated results are copied to the output
string. If the word is a special then the E_LRSEP (and a fill character)
and the second part of the flagged string are copied to the output
string.

Parameters:
instr   - NUL-terminated input word. Encoding stops at the first E_LRSEP;
          anything after that separator is copied to the output verbatim.
outptr0 - destination for the NUL-terminated encoded string. No length is
          passed in and no bounds check is made here, so the caller must
          supply a buffer that is large enough. */
#define PH_FOLD 'X' /* conversion from character to index */
#define PH_START 'Y' /* start of word character */
#define PH_END 'Z' /* end of word character */
#define PH_NOFLAG 0x7F /* no rule end here */
VOID
phencode(instr, outptr0)
char *instr; /* The word to be encoded */
char *outptr0; /* The encoded string */
{
int node; /* a node index */
int cc; /* the current character */
char *substr; /* portion of the original input word */
/* also pointer into string array */
char *word; /* input word with start/stop delimiters */
char *outptr = outptr0;
int matched; /* number of characters matched */
int index; /* phonetic character or index into str tab */
char inbuf[LONGWORD + 2]; /* Buffer for copy of input word */
/* Add start of word (PH_START) and end of word (PH_END) indicators
to input word. If input characters are not valid for the
language, make them lower case and remove accent marks if
necessary. Node is used as a temp. */
/* NOTE(review): the copy below writes PH_START, every input character,
PH_END and a NUL without checking against the size of inbuf; presumably
callers guarantee the word is short enough -- confirm. */
word = inbuf;
*word++ = PH_START;
while (*instr && *instr != E_LRSEP)
{
cc = ctoi(*instr++);
if (!scvalid(cc))
{
/* Keep the lower-cased form only if that form is valid;
otherwise the original character is stored unchanged. */
node = tolower(cc);
cc = scvalid(node) ? node : cc;
}
*word++ = cc;
}
*word++ = PH_END;
*word = 0;
/* Begin search for each character of word */
word = inbuf;
while (*word && *word != PH_END)
{
/* Walk the rule tables from the root (node 0) starting at the
current position, remembering the longest prefix at which a rule
ends. */
matched = 0;
substr = word;
node = 0;
while (cc = ctoi(*substr++))
{
/* Have a character, so go to the node for the
character. The first line checks for a link to
the next character. The next prevents a
reference outside the node list. The third checks
that the node belongs to the state. If the
search is successful, record the number of
characters matched and the current node. */
/* Fold the character into a table offset relative to PH_FOLD,
kept to 8 bits. */
cc = (cc - PH_FOLD) & 0xFF;
if ((node = Engnlink[node]) == 0
|| (node += cc) >= PH_SIZE
|| ctoi(Engnchar[node]) != cc)
break;
else if (Engnindex[node] != PH_NOFLAG)
{
matched = substr - word;
index = node;
}
}
/* NOTE(review): an earlier comment at this point described recording
the match when the end of the string is reached. No separate code
does that; matches are recorded only inside the loop above. */
/* Convert the matched portion of the string:
If no character was matched, the default is character
into character E_FILL. If the character only marked the
start of the string (PH_START), ignore it. */
if (!matched)
{
if (*word != PH_START)
{
*outptr++ = *word;
*outptr++ = E_FILL;
}
++word;
continue;
}
/* Store actual index */
/* index is only read here when matched is non-zero, in which case the
loop above has assigned it. */
index = Engnindex[index];
/* If the index is less than 10, it represents a phonetic
character. Ignore the start and end characters. If only
one character was matched, converted string is input
character, phonetic character (converted index). If not,
two characters were matched (a repeat) which becomes
input character, phonetic character, E_REPEAT, E_FILL. */
if (index < 10)
{
if (*word == PH_START)
{
/* The start marker counted as a matched character but
produces no output, so drop it from the count. */
--matched;
++word;
}
*outptr++ = *word++;
*outptr++ = index + E_LOPH;
if (matched == 2 && *word != PH_END)
{
*outptr++ = E_REPEAT;
*outptr++ = E_FILL;
++word;
}
continue;
}
/* Index into string table. Subtract 10 from index to
adjust. If two characters were matched, index is
into the first part of the string table, otherwise
add the offset. */
substr = (char *) (Engstr + index - 10);
if (matched != 2)
substr += PH_SEP;
/* There are two types of strings in the table. Those which
should be combined with the original string, and those
which represent the entire phonetic translation. The
first type will begin with characters less than 'A'. The
phonetic string is created by alternating characters from
the original input string with these characters. Start
(PH_START) and stop (PH_END) characters are ignored. For
the other type, the string is simply copied. */
if (*substr < 'A')
{
while (--matched >= 0)
{
if (*word == PH_START)
{
++word;
continue;
}
/* At PH_END no input character is emitted and word is not
advanced, but the table character is still written. */
if (*word != PH_END)
*outptr++ = *word++;
*outptr++ = *substr++;
}
}
else
{
/* Copy the table string including its NUL, then step back so
the next output overwrites that NUL. */
while (*outptr++ = *substr++)
;
--outptr;
word += matched;
}
}
/* If the input string was special, copy the second half to
the output string */
/* instr was left by the first loop on either the terminating NUL or
the E_LRSEP. The copy below includes the terminating NUL. */
if (*instr++ == E_LRSEP)
{
*outptr++ = E_LRSEP;
*outptr++ = E_FILL;
while (*outptr++ = *instr++)
;
}
else
*outptr = 0;
}
/* Compute the bin characters from an essence.

   The essence is looked up in Engphbintab with a binary search (the table
   is presumably sorted in strcmp order -- the search relies on it).  The
   index chosen is the highest one whose entry does not compare greater
   than the essence; entry 0 is never compared and acts as the fallback.
   That index is then written to `bin' as two "digits" in base NUMALPH,
   each offset by MINALPH, followed by a terminating NUL (three bytes in
   all). */
VOID
phprepend(essptr, bin)
char *essptr;   /* the essence to look up */
char *bin;      /* receives two bin characters and a NUL */
{
    int lo = 0;             /* lowest candidate index */
    int hi = NBIN - 1;      /* highest candidate index */
    int mid;                /* probe position */

    while (lo < hi) {
        /* Round the midpoint up so that `lo = mid' always makes
           progress. */
        mid = (lo + hi + 1) >> 1;
        if (strcmp(essptr, Engphbintab[mid]) < 0) {
            hi = mid - 1;
        } else {
            lo = mid;
        }
    }

    /* Most significant "digit" first. */
    bin[0] = MINALPH + lo / NUMALPH;
    bin[1] = MINALPH + lo % NUMALPH;
    bin[2] = 0;
}
/* Return the bin for a word.

   `str' is a string in the two-byte phonetic form: pairs of (input
   character, phonetic character).  The pairs are scanned up to the NUL or
   up to an E_LRSEP in the input-character position.  From each pair only
   the phonetic character is considered, and an "essence" string is built
   from those characters as follows:
     - characters outside E_LOPH..E_HIPH are dropped;
     - for characters inside E_LOGPH..E_HIGPH, one equal to the previously
       seen character of that range is dropped (the "previous" value
       starts out as E_LOGPH), and E_VOWEL is always dropped;
     - everything else is appended.
   The essence is then handed to phprepend(), which writes the bin
   characters and a terminating NUL into `bin'. */
VOID
phbin(str, bin)
char *str;      /* The string to find the bin */
char *bin;      /* The bin */
{
    int cc;                 /* character in str */
    char *essptr;           /* pointer into essence buffer */
    int prevc;              /* previous group phonetic char */
    char essence[LONGWORD]; /* NOTE(review): not bounds-checked; assumes
                               the word has fewer than LONGWORD pairs */

    /* Compute the essence for the word. */
    prevc = E_LOGPH;
    essptr = essence;
    while ((cc = *str++) && cc != E_LRSEP) {
        /* Second byte of the pair: the phonetic character. */
        cc = ctoi(*str++);

        /* A NUL here means the string ended in the middle of a pair.
           Stop now instead of scanning past the terminator. */
        if (cc == 0) {
            break;
        }

        /* Ignore fill characters and anything else outside the
           phonetic character range. */
        if (cc < E_LOPH || cc > E_HIPH) {
            continue;
        }

        if (cc >= E_LOGPH && cc <= E_HIGPH) {
            /* Collapse a repeat of the previous group character. */
            if (cc == prevc) {
                continue;
            }
            prevc = cc;
            /* Vowels update prevc but never enter the essence. */
            if (cc == E_VOWEL) {
                continue;
            }
        }
        *essptr++ = cc;
    }
    *essptr = 0;

    /* Compute the bin characters from the essence. */
    phprepend(essence, bin);
}
/* Convert a string, in place, from the two-byte phonetic form to the
   one-byte phonetic form.

   Each (input character, phonetic character) pair is replaced by the
   index of the matching two-byte entry in Engphtab; a pair with no
   matching entry is replaced by EFILLIND.  When the pair just processed
   has E_LRSEP as its first byte, the remainder of the string (the right
   half of a special) is appended unchanged with strecpy() and the
   function returns.  Otherwise the compressed string is NUL-terminated
   when the input is exhausted.  The input is expected to consist of whole
   pairs. */
VOID
phcompress(str0)
char *str0;     /* The string to compress */
{
    char *src = str0;       /* read position (two bytes per step) */
    char *dst = str0;       /* write position (one byte per step) */
    char *tabend;           /* one past the last table entry */
    char *entry;            /* table entry being compared */
    char alph;              /* alpha part of pair */
    char phon;              /* phon part of pair */

    tabend = Engphtab[0] + (NPHON << 1);

    while (*src) {
        alph = *src++;
        phon = *src++;

        /* Linear search of the table for this pair. */
        for (entry = Engphtab[0]; entry != tabend; entry += 2) {
            if (entry[0] == alph && entry[1] == phon) {
                break;
            }
        }

        if (entry != tabend) {
            /* Entries are two bytes wide, so halve the byte offset. */
            *dst++ = (entry - Engphtab[0]) >> 1;
        } else {
            *dst++ = EFILLIND;
        }

        /* The code for the separator pair itself has been written
           above; what follows it is copied as is. */
        if (alph == E_LRSEP) {
            strecpy(dst, src);
            return;
        }
    }
    *dst = 0;
}
/* Create the fully encoded form of a word.  This is used to look up a word
   in a lexicon or CLAM.

   The result in `dest' is two bin characters followed by the one-byte
   phonetic form of `source'.  When `shortflag' is non-zero (lexicon lookup
   where the lexicon carries no binning information) both bin characters
   are simply MINALPH; otherwise they are computed by phbin(). */
VOID
phfull(source, dest, shortflag)
char *source;   /* The string to encode */
char *dest;     /* The output string */
int shortflag;  /* Encode for a single bin */
{
    char peword[MAXPHWORD]; /* phonetically encoded word buffer */

    /* Step 1: two-character phonetic form. */
    phencode(source, peword);

    /* Step 2: bin characters go in front. */
    if (shortflag) {
        dest[1] = MINALPH;
        dest[0] = dest[1];
    } else {
        phbin(peword, dest);
    }

    /* Step 3: squeeze to one-character phonetics and append after the
       two bin characters. */
    phcompress(peword);
    strecpy(dest + 2, peword);
}
/* Convert a word into its canonical form and return its flags.

   Returns ERROR, without writing to `outword', when the word is empty,
   contains E_LRSEP, or has `inlen' or more characters.

   Otherwise the return value combines:
     - IW_ALLDOT when the word is strictly letter, dot, letter, dot ...
       ending in a dot, with at least one dot that is not the last
       character; else IW_SPECIAL if a dot occurs before the last
       character, and IW_ENDDOT if the last character is a dot;
     - one of IW_ACRONYM (has capitals, no lower case), IW_PROPER (only the
       first character is a capital), IW_COMMON (no capitals), or
       IW_SPECIAL (any other mix);
     - IW_SPECIAL also when a character fails scvalid() or is above 0x80.
   For specials only IW_SPECIAL and IW_ENDDOT are returned.

   `outword' receives the lower-cased word.  Dots are removed unless the
   word is a special; a special is written as lower-cased text, E_LRSEP,
   then the original text, with a trailing dot stripped from both parts
   when IW_ENDDOT is set.  The caller must supply a large enough buffer. */
int doflags(word0, outword, inlen)
char *word0;    /* the word to convert */
char *outword;  /* the place for the converted word */
int inlen;      /* maximum input word length */
{
    char *scan;             /* read pointer into the input word */
    char *last;             /* the final character of the input word */
    int cc;                 /* character temp */
    int flags = 0;          /* accumulates the word's flags */
    int nchars = 0;         /* characters seen so far */
    int ncaps = 0;          /* upper case letters seen */
    int sawlower = FALSE;   /* a lower case letter was seen */
    int alldots = TRUE;     /* still alternating letter, dot, ... */
    int dot = FALSE;        /* a dot occurs before the last character */
    int enddot;             /* the last character is a dot */

    if (*word0 == 0) {
        return (ERROR);
    }

    /* Pass 1: classify the word. */
    scan = word0;
    while ((cc = ctoi(*scan++)) != 0) {
        ++nchars;
        if (cc == E_LRSEP) {
            return (ERROR);
        }
        if (!scvalid(cc) || cc > 0x80) {    /* high-ascii hack */
            flags = IW_SPECIAL;
            alldots = FALSE;
            continue;
        }
        if (isupper(cc)) {
            ++ncaps;
        }
        if (islower(cc)) {
            sawlower = TRUE;
        }
        if (cc == '.') {
            /* A dot as the 1st, 3rd, 5th ... character breaks the
               letter/dot alternation. */
            if (nchars & 1) {
                alldots = FALSE;
            }
            /* Any dot followed by another character is an interior
               dot. */
            if (*scan) {
                dot = TRUE;
            }
        } else if (!(nchars & 1)) {
            /* The 2nd, 4th, 6th ... character must be a dot. */
            alldots = FALSE;
        }
    }

    /* Return with error if there are too many input characters. */
    if (nchars >= inlen) {
        return (ERROR);
    }

    /* scan is now one past the NUL, so the last character is two back. */
    last = scan - 2;
    enddot = (*last == '.');

    /* Flags that depend on the dots in the word. */
    if (dot && alldots && enddot) {
        flags = IW_ALLDOT;
    } else {
        if (dot) {
            flags = IW_SPECIAL;
        }
        if (enddot) {
            flags |= IW_ENDDOT;
        }
    }

    /* Flags that depend on the capitalization of the word. */
    if (!sawlower && ncaps) {
        flags |= IW_ACRONYM;
    } else if (isupper(ctoi(*word0)) && ncaps == 1) {
        flags |= IW_PROPER;
    } else if (ncaps == 0) {
        flags |= IW_COMMON;
    } else {
        flags |= IW_SPECIAL;
    }

    /* Pass 2: write the lower-cased word.  Dots are kept only for
       specials. */
    for (scan = word0; *scan; scan++) {
        cc = tolower(ctoi(*scan));
        if ((flags & IW_SPECIAL) || cc != '.') {
            *outword++ = cc;
        }
    }

    /* Specials get E_LRSEP and the untouched input appended.  A trailing
       dot is dropped from both halves and reported through IW_ENDDOT
       instead. */
    if (flags & IW_SPECIAL) {
        flags &= IW_SPECIAL | IW_ENDDOT;
        if (flags & IW_ENDDOT) {
            --outword;
        }
        *outword++ = E_LRSEP;
        scan = word0;
        while ((*outword = *scan++) != 0) {
            ++outword;
        }
        /* outword rests on the copied NUL; back up over the dot. */
        if (flags & IW_ENDDOT) {
            --outword;
        }
    }

    /* Terminate the output string and return the word's flags. */
    *outword = 0;
    return (flags);
}
/* 'Undoflags' rebuilds a word from the string and the flag byte produced
   by 'doflags'.

   Specials: the caller is expected to have already removed everything up
   to and including the E_LRSEP, so the string is copied unchanged and a
   dot is appended when IW_ENDDOT is set.

   Everything else: the first letter is capitalised for IW_PROPER, every
   lower case letter is capitalised for IW_ACRONYM, a dot follows each
   letter for IW_ALLDOT, and one more dot is appended for IW_ENDDOT.  If
   IW_ALLDOT and IW_ENDDOT are both set the result therefore ends in two
   dots.  The output is always NUL-terminated. */
VOID
undoflags(word, flags, outword)
char *word;     /* The word to convert */
int flags;      /* The flags for the word */
char *outword;  /* The output word */
{
    int c;
    int casebits = flags & IW_CASE;     /* capitalisation class only */

    /* Specials: copy everything, NUL included. */
    if (flags & IW_SPECIAL) {
        while ((*outword = *word++) != 0) {
            ++outword;
        }
        ++outword;                      /* step past the copied NUL */
        if (flags & IW_ENDDOT) {
            /* Put the dot over the NUL just copied and terminate
               again after it. */
            outword[-1] = '.';
            outword[0] = 0;
        }
        return;
    }

    /* A proper noun gets its leading capital back, plus its dot if the
       word is an all-dot word. */
    if (casebits == IW_PROPER) {
        c = ctoi(*word++);
        *outword++ = _toupper(c);
        if (flags & IW_ALLDOT) {
            *outword++ = '.';
        }
    }

    /* Remaining characters. */
    while ((c = ctoi(*word++)) != 0) {
        if (casebits == IW_ACRONYM && islower(c)) {
            c = _toupper(c);
        }
        *outword++ = c;
        if (flags & IW_ALLDOT) {
            *outword++ = '.';
        }
    }

    if (flags & IW_ENDDOT) {
        *outword++ = '.';
    }
    *outword = 0;
}