mirror of
https://github.com/antoinevignau/source.git
synced 2024-10-31 22:06:40 +00:00
1 line
16 KiB
C
Executable File
1 line
16 KiB
C
Executable File
/***********************************************************************\
|
|
|
|
Filename: phon.c
|
|
|
|
\***********************************************************************/
|
|
|
|
#include "environ.h"
|
|
#include "string.h"
|
|
/*#include "ctype.h"*/
|
|
|
|
/* This routine converts a string into the two-character representation of
|
|
the phonetically encoded string. The input string is matched against the
|
|
phonetic rules and the translated results are copied to the output
|
|
string. If the word is a special then the E_LRSEP (and a fill character)
|
|
and the second part of the flagged string are copied to the output
|
|
string. */
|
|
|
|
#define PH_FOLD 'X' /* conversion from character to index */
|
|
#define PH_START 'Y' /* start of word character */
|
|
#define PH_END 'Z' /* end of word character */
|
|
#define PH_NOFLAG 0x7F /* no rule end here */
|
|
|
|
VOID
|
|
phencode(instr, outptr0)
|
|
char *instr; /* The word to be encoded */
|
|
char *outptr0; /* The encoded string */
|
|
{
|
|
int node; /* a node index */
|
|
int cc; /* the current character */
|
|
char *substr; /* portion of the original input word */
|
|
/* also pointer into string array */
|
|
char *word; /* input word with start/stop delimiters */
|
|
char *outptr = outptr0;
|
|
int matched; /* number of characters matched */
|
|
int index; /* phonetic character or index into str tab */
|
|
char inbuf[LONGWORD + 2]; /* Buffer for copy of input word */
|
|
|
|
/* Add start of word (PH_START) and end of word (PH_END) indicators
|
|
to input word. If input characters are not valid for the
|
|
language, make them lower case and remove accent marks if
|
|
necessary. Node is used as a temp. */
|
|
|
|
word = inbuf;
|
|
*word++ = PH_START;
|
|
while (*instr && *instr != E_LRSEP)
|
|
{
|
|
cc = ctoi(*instr++);
|
|
if (!scvalid(cc))
|
|
{
|
|
node = tolower(cc);
|
|
cc = scvalid(node) ? node : cc;
|
|
}
|
|
*word++ = cc;
|
|
}
|
|
*word++ = PH_END;
|
|
*word = 0;
|
|
|
|
/* Begin search for each character of word */
|
|
|
|
word = inbuf;
|
|
while (*word && *word != PH_END)
|
|
{
|
|
matched = 0;
|
|
substr = word;
|
|
node = 0;
|
|
while (cc = ctoi(*substr++))
|
|
{
|
|
/* Have a character, so go to the node for the
|
|
character. The first line checks for a link to
|
|
the next character. The next prevents a
|
|
reference outside the node list. The third checks
|
|
that the node belongs to the state. If the
|
|
search is successful, record the number of
|
|
characters matched and the current node. */
|
|
|
|
cc = (cc - PH_FOLD) & 0xFF;
|
|
if ((node = Engnlink[node]) == 0
|
|
|| (node += cc) >= PH_SIZE
|
|
|| ctoi(Engnchar[node]) != cc)
|
|
break;
|
|
else if (Engnindex[node] != PH_NOFLAG)
|
|
{
|
|
matched = substr - word;
|
|
index = node;
|
|
}
|
|
}
|
|
|
|
/* If we are here because we have reached the end of the
|
|
string, record the number of characters matched and the
|
|
index. */
|
|
|
|
/* Convert the matched portion of the string:
|
|
|
|
If no character was matched, the default is character
|
|
into character E_FILL. If the character only marked the
|
|
start of the string (PH_START), ignore it. */
|
|
|
|
if (!matched)
|
|
{
|
|
if (*word != PH_START)
|
|
{
|
|
*outptr++ = *word;
|
|
*outptr++ = E_FILL;
|
|
}
|
|
++word;
|
|
continue;
|
|
}
|
|
|
|
/* Store actual index */
|
|
|
|
index = Engnindex[index];
|
|
|
|
/* If the index is less than 10, it represents a phonetic
|
|
character. Ignore the start and end characters. If only
|
|
one character was matched, converted string is input
|
|
character, phonetic character (converted index). If not,
|
|
two characters were matched (a repeat) which becomes
|
|
input character, phonetic character, E_REPEAT, E_FILL. */
|
|
|
|
if (index < 10)
|
|
{
|
|
if (*word == PH_START)
|
|
{
|
|
--matched;
|
|
++word;
|
|
}
|
|
*outptr++ = *word++;
|
|
*outptr++ = index + E_LOPH;
|
|
if (matched == 2 && *word != PH_END)
|
|
{
|
|
*outptr++ = E_REPEAT;
|
|
*outptr++ = E_FILL;
|
|
++word;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
/* Index into string table. Subtract 10 from index to
|
|
adjust. If two characters were matched, index is
|
|
into the first part of the string table, otherwise
|
|
add the offset. */
|
|
|
|
substr = (char *) (Engstr + index - 10);
|
|
if (matched != 2)
|
|
substr += PH_SEP;
|
|
|
|
/* There are two types of strings in the table. Those which
|
|
should be combined with the original string, and those
|
|
which represent the entire phonetic translation. The
|
|
first type will begin with characters less than 'A'. The
|
|
phonetic string is created by alternating characters from
|
|
the original input string with these characters. Start
|
|
(PH_START) and stop (PH_END) characters are ignored. For
|
|
the other type, the string is simply copied. */
|
|
|
|
if (*substr < 'A')
|
|
{
|
|
while (--matched >= 0)
|
|
{
|
|
if (*word == PH_START)
|
|
{
|
|
++word;
|
|
continue;
|
|
}
|
|
if (*word != PH_END)
|
|
*outptr++ = *word++;
|
|
*outptr++ = *substr++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
while (*outptr++ = *substr++)
|
|
;
|
|
--outptr;
|
|
word += matched;
|
|
}
|
|
}
|
|
|
|
/* If the input string was special, copy the second half to
|
|
the output string */
|
|
|
|
if (*instr++ == E_LRSEP)
|
|
{
|
|
*outptr++ = E_LRSEP;
|
|
*outptr++ = E_FILL;
|
|
while (*outptr++ = *instr++)
|
|
;
|
|
}
|
|
else
|
|
*outptr = 0;
|
|
}
|
|
|
|
/* Compute the bin characters from an essence. */
|
|
|
|
VOID
|
|
phprepend(essptr, bin)
|
|
char *essptr;
|
|
char *bin;
|
|
{
|
|
int i;
|
|
int minval; /* lower limit on table search */
|
|
int maxval; /* upper limit on table search */
|
|
|
|
/* binary-search the bin table for the essence, making minval the
|
|
appropriate index into the table */
|
|
|
|
/* printf("essence is "); hexprint(essptr); printf("\n"); */
|
|
|
|
minval = 0;
|
|
maxval = NBIN - 1;
|
|
|
|
while (minval < maxval)
|
|
{
|
|
/* printf("ph bin %d: ", i); hexprint(Engphbintab[i]); printf("\n"); */
|
|
i = (maxval + minval + 1) >> 1;
|
|
if (strcmp(essptr, Engphbintab[i]) >= 0)
|
|
minval = i;
|
|
else
|
|
maxval = i - 1;
|
|
}
|
|
|
|
/* printf("ph bin is %d: ", i); hexprint(Engphbintab[i]); printf("\n"); */
|
|
|
|
/* convert minval to a 2-digit number in base numalph, with
|
|
alphabetic phonetic characters as the "digits"; copy the 2
|
|
characters into bin and null-terminate it */
|
|
|
|
*bin++ = MINALPH + minval / NUMALPH;
|
|
*bin++ = MINALPH + minval % NUMALPH;
|
|
*bin = 0;
|
|
|
|
/* printf("ph bin chars: %02x %02x\n", *(bin - 2), *(bin - 1)); */
|
|
}
|
|
|
|
/* Return the bin for a word. */
|
|
|
|
VOID
|
|
phbin(str, bin)
|
|
char *str; /* The string to find the bin */
|
|
char *bin; /* The bin */
|
|
{
|
|
int cc; /* character in str */
|
|
char *essptr; /* pointer into essence buffer */
|
|
int prevc; /* previous group phonetic char */
|
|
char essence[LONGWORD];
|
|
char encbuf[MAXPHWORD];
|
|
|
|
/* printf("Essence is "); */
|
|
|
|
/* Compute the essence for the word. */
|
|
|
|
prevc = E_LOGPH;
|
|
essptr = essence;
|
|
while ((cc = *str++) && cc != E_LRSEP)
|
|
{
|
|
cc = ctoi(*str++);
|
|
|
|
/* phonetic character -- again, ignore fill charact
|
|
character out of range */
|
|
|
|
if (cc < E_LOPH || cc > E_HIPH)
|
|
continue;
|
|
if (cc >= E_LOGPH && cc <= E_HIGPH)
|
|
{
|
|
if (cc == prevc)
|
|
continue;
|
|
prevc = cc;
|
|
if (cc == E_VOWEL)
|
|
continue;
|
|
}
|
|
*essptr++ = cc;
|
|
/* printf("%c", *(str - 2)); */
|
|
}
|
|
*essptr = 0;
|
|
|
|
/* printf("\n");
|
|
gets(dummy); */
|
|
|
|
/* Compute the bin characters from the essence. */
|
|
|
|
phprepend(essence, bin);
|
|
}
|
|
|
|
/* Convert a string from the two byte phonetic form to the one byte phonetic
|
|
form. Return the result in the input string. Non compressible characters
|
|
in specials are replaced with the code for E_FILL,E_FILL. The right half
|
|
of a special is copied as is. */
|
|
|
|
VOID
|
|
phcompress(str0)
|
|
char *str0; /* The string to compress */
|
|
{
|
|
char *pairptr;/* one pair in phonetic table */
|
|
char *pairend; /* end of phonetic table */
|
|
char *str = str0;
|
|
char *cmpstr; /* pointer into compressed string */
|
|
char alph; /* alpha part of pair */
|
|
char phon; /* phon part of char */
|
|
|
|
cmpstr = str;
|
|
pairend = Engphtab[0] + (NPHON << 1);
|
|
while (*str)
|
|
{
|
|
alph = *str++;
|
|
phon = *str++;
|
|
|
|
/* Search the phonetic table for an entry that matches the
|
|
pair of characters. If one is not found, return the
|
|
index for fills. */
|
|
|
|
pairptr = Engphtab[0];
|
|
while (1) {
|
|
if (pairptr == pairend) {
|
|
*cmpstr++ = EFILLIND;
|
|
break;
|
|
}
|
|
if (alph == pairptr[0] && phon == pairptr[1]) {
|
|
*cmpstr++ = (pairptr - Engphtab[0])
|
|
>> 1;
|
|
break;
|
|
}
|
|
pairptr += 2;
|
|
}
|
|
/* If the character is the special separator, copy the right
|
|
half of the special. */
|
|
|
|
if (alph == E_LRSEP) {
|
|
strecpy(cmpstr, str);
|
|
return;
|
|
}
|
|
}
|
|
*cmpstr = 0;
|
|
}
|
|
|
|
/* Create the fully encoded form of a word. This is used to look up a word
|
|
in a lexicon or CLAM. `shortflag' is used for lexicon lookup when the
|
|
lexicon contains no binning information. */
|
|
|
|
VOID
|
|
phfull(source, dest, shortflag)
|
|
char *source; /* The string to encode */
|
|
char *dest; /* The output string */
|
|
int shortflag; /* Encode for a single bin */
|
|
{
|
|
char peword[MAXPHWORD]; /* phonetically encoded word buffer */
|
|
|
|
/* Translate the word into the two-character phonetic form. */
|
|
|
|
phencode(source, peword);
|
|
|
|
/* Get the bin characters for the word. */
|
|
|
|
if (!shortflag)
|
|
phbin(peword, dest);
|
|
else
|
|
dest[0] = dest[1] = MINALPH;
|
|
|
|
/* Compress the word into one character phonetics. */
|
|
|
|
phcompress(peword);
|
|
|
|
strecpy(dest + 2, peword);
|
|
}
|
|
|
|
/* Convert a word into its canonical form and return its flag. */
|
|
|
|
int doflags(word0, outword, inlen)
|
|
char *word0; /* the word to convert */
|
|
char *outword; /* the place for the converted word */
|
|
int inlen; /* maximum input word length */
|
|
{
|
|
char *word; /* pointer into word */
|
|
int cc; /* character temp */
|
|
int flags; /* used to accumulate the word's flags */
|
|
int dot; /* the word contains a dot */
|
|
int alldots; /* the word is alternating letters and dots */
|
|
int ncaps; /* the number of upper case letters */
|
|
int allcaps; /* all letters are upper case */
|
|
int chars; /* number of characters */
|
|
|
|
/* Initialize everything */
|
|
|
|
word = word0;
|
|
if (word[0] == 0) {
|
|
return (ERROR);
|
|
}
|
|
flags = chars = ncaps = 0;
|
|
allcaps = alldots = TRUE;
|
|
dot = FALSE;
|
|
|
|
/* Scan the word and test the type of the word. */
|
|
|
|
while (cc = ctoi(*word++)) {
|
|
++chars;
|
|
if (cc == E_LRSEP) {
|
|
return (ERROR);
|
|
}
|
|
if (!scvalid(cc) || cc > 0x80) { /* high-ascii hack */
|
|
flags = IW_SPECIAL;
|
|
alldots = FALSE;
|
|
continue;
|
|
}
|
|
if (isupper(cc)) {
|
|
++ncaps;
|
|
}
|
|
if (islower(cc)) {
|
|
allcaps = FALSE;
|
|
}
|
|
if (chars & 1) {
|
|
/* This is a character in an even position; if it is
|
|
a dot then this word is not an ALLDOT word. */
|
|
|
|
if (cc == '.') {
|
|
alldots = FALSE;
|
|
if (*word) {
|
|
dot = TRUE;
|
|
}
|
|
}
|
|
} else {
|
|
/* This character is in an odd position; if it is
|
|
not a dot then this word is not ALLDOT. */
|
|
|
|
if (cc != '.') {
|
|
alldots = FALSE;
|
|
} else if (*word) {
|
|
dot = TRUE;
|
|
}
|
|
}
|
|
}
|
|
/* Return with error if the there are too many input characters. */
|
|
|
|
if (chars >= inlen) {
|
|
return (ERROR);
|
|
}
|
|
/* Assign those flags which depend on the dots in the word. */
|
|
|
|
word -= 2;
|
|
if (dot && alldots && *word == '.') {
|
|
flags = IW_ALLDOT;
|
|
} else {
|
|
if (dot) {
|
|
flags = IW_SPECIAL;
|
|
}
|
|
if (*word == '.') {
|
|
flags |= IW_ENDDOT;
|
|
}
|
|
}
|
|
/* Assign those flags which depend on the capitalization of the
|
|
word. */
|
|
|
|
if (allcaps && ncaps) {
|
|
flags |= IW_ACRONYM;
|
|
} else if (isupper(ctoi(*word0)) && ncaps == 1) {
|
|
flags |= IW_PROPER;
|
|
} else if (ncaps == 0) {
|
|
flags |= IW_COMMON;
|
|
} else {
|
|
flags |= IW_SPECIAL;
|
|
}
|
|
/* Copy the word to the output buffer. Translate upper case to
|
|
lower case. Remove dots unless the word is a special. */
|
|
|
|
word = word0;
|
|
while (cc = tolower(ctoi(*word)), *word++) {
|
|
if (cc != '.' || flags & IW_SPECIAL) {
|
|
*outword++ = cc;
|
|
}
|
|
}
|
|
/* Generate the remaining part of the output word for specials.
|
|
This consists of E_LRSEP followed by the input word. If the
|
|
input word has a trailing dot then strip this from both parts of
|
|
the special and set the return flag to indicate the trailing
|
|
dot. */
|
|
|
|
if (flags & IW_SPECIAL) {
|
|
|
|
flags &= IW_SPECIAL | IW_ENDDOT;
|
|
if (flags & IW_ENDDOT) {
|
|
--outword;
|
|
}
|
|
*outword++ = E_LRSEP;
|
|
|
|
word = word0;
|
|
while (*outword++ = *word++)
|
|
;
|
|
--outword;
|
|
if (flags & IW_ENDDOT) {
|
|
--outword;
|
|
}
|
|
}
|
|
/* Terminate the output string and return the word's flags */
|
|
|
|
*outword = 0;
|
|
|
|
return (flags);
|
|
}
|
|
|
|
/* 'Undoflags' reconstitutes a word from the string and the flag byte
|
|
produced by 'doflags'. It does this as expected for all types other than
|
|
specials. For specials, it assumes that the word presented to it has had
|
|
the text up to and including the E_LRSEP character removed, and so it
|
|
just copies the string and adds the trailing dot for IW_ENDDOT words. */
|
|
|
|
VOID
|
|
undoflags(word, flags, outword)
|
|
char *word; /* The word to convert */
|
|
int flags; /* The flags for the word */
|
|
char *outword; /* The output word */
|
|
{
|
|
int c;
|
|
|
|
/* For specials, copy the entire string and then add a terminal dot,
|
|
if required. */
|
|
|
|
if (flags & IW_SPECIAL) {
|
|
while (*outword++ = *word++)
|
|
;
|
|
if (flags & IW_ENDDOT) {
|
|
*(outword - 1) = '.';
|
|
*outword = 0;
|
|
}
|
|
return;
|
|
}
|
|
/* Re-create the word, if it is not a special. The first thing to
|
|
do is to make the initial letter a capital if it was originally.
|
|
Then add the dot after this letter if the word is an alldot
|
|
word. */
|
|
|
|
if ((flags & IW_CASE) == IW_PROPER) {
|
|
c = ctoi(*word++);
|
|
*outword++ = _toupper(c);
|
|
|
|
/* restore a dot if IW_ALLDOT */
|
|
|
|
if (flags & IW_ALLDOT) {
|
|
*outword++ = '.';
|
|
}
|
|
}
|
|
/* The next thing to do is to copy the remaining characters to the
|
|
output string, processing as required. If the flag has
|
|
IW_ACRONYM set then make all letters upper case. If the flag has
|
|
IW_ALLDOT set then append a dot to every letter. After the word
|
|
has been copied, if the flag has IW_ENDDOT set then add a dot to
|
|
the word. Note that this assumes that IW_ALLDOT and IW_ALLDOT
|
|
are not both set. If they are then this will break. */
|
|
|
|
while (c = ctoi(*word++)) {
|
|
if (islower(c) && (flags & IW_CASE) == IW_ACRONYM) {
|
|
c = _toupper(c);
|
|
}
|
|
*outword++ = c;
|
|
|
|
if (flags & IW_ALLDOT) {
|
|
*outword++ = '.';
|
|
}
|
|
}
|
|
if (flags & IW_ENDDOT) {
|
|
*outword++ = '.';
|
|
}
|
|
*outword = 0;
|
|
}
|