antoine-source/appleworksgs/Spell/Src/PHON.C

/***********************************************************************\

   Filename: phon.c

\***********************************************************************/

#include "environ.h"
#include "string.h"
/*#include "ctype.h"*/

/* This routine converts a string into the two-character representation of
   the phonetically encoded string.  The input string is matched against the
   phonetic rules and the translated results are copied to the output
   string.  If the word is a special then the E_LRSEP (and a fill character)
   and the second part of the flagged string are copied to the output
   string.  */

#define PH_FOLD   'X'           /* conversion from character to index */
#define PH_START  'Y'           /* start of word character */
#define PH_END    'Z'           /* end of word character */
#define PH_NOFLAG 0x7F          /* no rule end here */

/* phencode: translate `instr' into the two-byte-per-unit phonetic encoding
   and store the NUL-terminated result at `outptr0'.

   instr    the word to encode; encoding stops at the terminating NUL or
            at the first E_LRSEP character.
   outptr0  destination buffer.  Nothing in this routine limits how much
            is written, so the caller must supply a large enough buffer.

   The routine works in three phases:
     1. copy the word into `inbuf', bracketed by PH_START and PH_END;
     2. repeatedly find the longest rule match at the current position in
        the Engnlink/Engnchar/Engnindex tables and emit its translation;
     3. if the input stopped at E_LRSEP, append E_LRSEP, E_FILL and the
        remainder of the input unchanged.  */
VOID
phencode(instr, outptr0)
char   *instr;                 /* The word to be encoded */
char   *outptr0;               /* The encoded string */
{
 int node;      /* a node index */
 int cc;        /* the current character */
 char *substr; /* portion of the original input word */
                        /* also pointer into string array */
 char *word;   /* input word with start/stop delimiters */
 char *outptr = outptr0;
int     matched;        /* number of characters matched */
int     index;          /* phonetic character or index into str tab */
                        /* only assigned when a rule matches, so it is
                           valid only while matched != 0 */
char   inbuf[LONGWORD + 2];    /* Buffer for copy of input word */

   /* Add start of word (PH_START) and end of word (PH_END) indicators
      to input word.  If input characters are not valid for the
      language, make them lower case and remove accent marks if
      necessary.  Node is used as a temp.

      NOTE(review): the copy below is not bounds checked.  It stores
      PH_START, one byte per input character, PH_END and a NUL, so the
      part of `instr' before the NUL/E_LRSEP must be at most
      LONGWORD - 1 characters to fit in inbuf.  */

   word = inbuf;
   *word++ = PH_START;
   while (*instr && *instr != E_LRSEP)
   {
       cc = ctoi(*instr++);
       if (!scvalid(cc))
       {
           /* keep the lower-cased form only if that one is valid;
              otherwise the character is stored unchanged */
           node = tolower(cc);
           cc = scvalid(node) ? node : cc;
       }
       *word++ = cc;
   }
   *word++ = PH_END;
   *word = 0;

   /* Begin search for each character of word.  Each pass of this loop
      consumes at least one character of inbuf and stops when `word'
      reaches PH_END (or the terminating NUL).  */

   word = inbuf;
   while (*word && *word != PH_END)
   {
       matched = 0;
       substr = word;
       node = 0;
       /* Walk the rule tables from the root (node 0), one input
          character at a time, until the NUL after PH_END is read or
          the walk fails.  */
       while (cc = ctoi(*substr++))
       {
           /* Have a character, so go to the node for the
              character.  The first line checks for a link to
              the next character.  The next prevents a
              reference outside the node list.  The third checks
              that the node belongs to the state.  If the
              search is successful, record the number of
              characters matched and the current node. */

           /* fold the character into a table offset relative to
              PH_FOLD */
           cc = (cc - PH_FOLD) & 0xFF;
           if ((node = Engnlink[node]) == 0
             || (node += cc) >= PH_SIZE
             || ctoi(Engnchar[node]) != cc)
               break;
           else if (Engnindex[node] != PH_NOFLAG)
           {
               /* a rule ends at this node; later, longer matches
                  overwrite these values, so the longest match wins */
               matched = substr - word;
               index = node;
           }
       }

       /* At this point `matched' and `index' describe the longest rule
          that ended on the path walked above; this holds whether the
          walk stopped on a failed link or at the end of the string.
          No further bookkeeping is done here.  */

       /* Convert the matched portion of the string:

          If no character was matched, the default is character
          into character E_FILL.  If the character only marked the
          start of the string (PH_START), ignore it.  */

       if (!matched)
       {
           if (*word != PH_START)
           {
               *outptr++ = *word;
               *outptr++ = E_FILL;
           }
           ++word;
           continue;
       }

       /* Store actual index: replace the node number by the rule value
          kept for that node in Engnindex.  */

       index = Engnindex[index];

       /* If the index is less than 10, it represents a phonetic
          character.  Ignore the start and end characters.  If only
          one character was matched, converted string is input
          character, phonetic character (converted index).  If not,
          two characters were matched (a repeat) which becomes
          input character, phonetic character, E_REPEAT, E_FILL. */

       if (index < 10)
       {
           if (*word == PH_START)
           {
               /* the start marker took part in the match but
                  produces no output */
               --matched;
               ++word;
           }
           *outptr++ = *word++;
           *outptr++ = index + E_LOPH;
           /* `word' already points at the second matched character;
              it is consumed as a repeat unless it is the end marker */
           if (matched == 2 && *word != PH_END)
           {
               *outptr++ = E_REPEAT;
               *outptr++ = E_FILL;
               ++word;
           }
           continue;
       }

       /* Index into string table.  Subtract 10 from index to
          adjust.  If two characters were matched, index is
          into the first part of the string table, otherwise
          add the offset. */

       substr = (char *) (Engstr + index - 10);
       if (matched != 2)
           substr += PH_SEP;

       /* There are two types of strings in the table.  Those which
          should be combined with the original string, and those
          which represent the entire phonetic translation.  The
          first type will begin with characters less than 'A'.  The
          phonetic string is created by alternating characters from
          the original input string with these characters.  Start
          (PH_START) and stop (PH_END) characters are ignored.  For
          the other type, the string is simply copied.  */

       if (*substr < 'A')
       {
           /* one pass per matched character: PH_START is skipped
              without output; for PH_END only the table character is
              written and `word' is not advanced */
           while (--matched >= 0)
           {
               if (*word == PH_START)
               {
                   ++word;
                   continue;
               }
               if (*word != PH_END)
                   *outptr++ = *word++;
               *outptr++ = *substr++;
           }
       }
       else
       {
           /* copy the table string including its NUL, then step back
              so the next output overwrites that NUL */
           while (*outptr++ = *substr++)
               ;
           --outptr;
           word += matched;
       }
   }

   /* If the input string was special, copy the second half to
      the output string.  `instr' still points at the character that
      stopped the first loop (NUL or E_LRSEP).  The copy includes the
      terminating NUL of the input.  */

   if (*instr++ == E_LRSEP)
   {
       *outptr++ = E_LRSEP;
       *outptr++ = E_FILL;
       while (*outptr++ = *instr++)
           ;
   }
   else
       *outptr = 0;
}

/* Compute the bin characters from an essence. */

/* phprepend: map an essence string to its two bin characters.

   essptr  NUL-terminated essence string.
   bin     receives two characters followed by a NUL (three bytes).

   The bin number is the index of the last entry of Engphbintab that does
   not compare greater than the essence (strcmp order), or 0 when even the
   entries above index 0 all compare greater.  The search halves the
   range each step, so it presumably relies on Engphbintab being sorted in
   ascending strcmp order -- TODO confirm against the table definition.  */
VOID
phprepend(essptr, bin)
char   *essptr;
char   *bin;
{
    int lo;             /* lowest candidate bin index */
    int hi;             /* highest candidate bin index */
    int mid;            /* probe position */

    lo = 0;
    hi = NBIN - 1;
    while (lo < hi) {
        /* round the midpoint up so that "lo = mid" always makes
           progress */
        mid = (lo + hi + 1) >> 1;
        if (strcmp(essptr, Engphbintab[mid]) < 0)
            hi = mid - 1;
        else
            lo = mid;
    }

    /* Write the bin number as two base-NUMALPH "digits", each offset
       by MINALPH, and terminate the result.  */
    bin[0] = MINALPH + lo / NUMALPH;
    bin[1] = MINALPH + lo % NUMALPH;
    bin[2] = 0;
}

/* Return the bin for a word. */

/* phbin: compute the bin characters for a phonetically encoded word.

   str  the word in the two-byte form: each unit is one character
        followed by its phonetic character.  Scanning stops at the
        terminating NUL or at a unit that starts with E_LRSEP.
   bin  receives the bin characters as written by phprepend().

   The "essence" of the word is built from the phonetic characters only:
     - phonetic characters outside E_LOPH..E_HIPH are dropped;
     - for characters inside E_LOGPH..E_HIGPH, one that equals the
       previous character seen from that range is dropped, and E_VOWEL is
       never stored (it still becomes the new "previous" character);
     - everything else is appended.
   The essence is then handed to phprepend().

   The essence buffer holds at most LONGWORD - 1 characters plus the NUL;
   anything beyond that is discarded rather than written past the end of
   the buffer.  */
VOID
phbin(str, bin)
char   *str;                   /* The string to find the bin */
char   *bin;                   /* The bin */
{
    int cc;             /* character in str */
    char *essptr;       /* pointer into essence buffer */
    char *esslast;      /* last slot of essence, reserved for the NUL */
    int prevc;          /* previous group phonetic char */
    char essence[LONGWORD];

    /* Compute the essence for the word. */

    prevc = E_LOGPH;
    essptr = essence;
    esslast = essence + LONGWORD - 1;
    while ((cc = *str++) && cc != E_LRSEP)
    {
        /* The first byte of the unit was only needed for the loop
           test; the second byte is the phonetic character. */
        cc = ctoi(*str++);

        /* Ignore phonetic characters that are out of range. */

        if (cc < E_LOPH || cc > E_HIPH)
            continue;
        if (cc >= E_LOGPH && cc <= E_HIGPH)
        {
            if (cc == prevc)
                continue;
            prevc = cc;
            if (cc == E_VOWEL)
                continue;
        }

        /* Stop once the buffer is full; one byte stays free for the
           terminator written below. */
        if (essptr == esslast)
            break;
        *essptr++ = cc;
    }
    *essptr = 0;

    /* Compute the bin characters from the essence. */

    phprepend(essence, bin);
}

/* Convert a string from the two byte phonetic form to the one byte phonetic
   form.  Return the result in the input string. Non compressible characters
   in specials are replaced with the code for E_FILL,E_FILL. The right half
   of a special is copied as is. */

/* phcompress: convert a string from the two-byte phonetic form to the
   one-byte form, in place.

   str0  the string to compress; it is read two bytes at a time and the
         result is written back starting at the same address.

   Each two-byte unit is replaced by the index of the matching two-byte
   entry of Engphtab, or by EFILLIND when no entry matches.  After the
   unit whose first byte is E_LRSEP has been converted, the rest of the
   input is handed to strecpy() to be moved down behind the compressed
   part, and the routine returns (strecpy is presumably responsible for
   the terminator in that case -- TODO confirm).  Otherwise the result is
   NUL-terminated here.  */
VOID
phcompress(str0)
char   *str0;                  /* The string to compress */
{
    char *src;          /* read position in the two-byte form */
    char *dst;          /* write position in the one-byte form */
    char *entry;        /* current entry of the phonetic table */
    char *tabend;       /* one past the last entry of the table */
    char first;         /* first byte of the current unit */
    char second;        /* second (phonetic) byte of the current unit */

    src = str0;
    dst = str0;
    tabend = Engphtab[0] + (NPHON << 1);

    while (*src) {
        first = *src++;
        second = *src++;

        /* Scan the table for this unit; `entry' ends up equal to
           `tabend' when nothing matches.  */
        for (entry = Engphtab[0]; entry != tabend; entry += 2) {
            if (entry[0] == first && entry[1] == second)
                break;
        }

        if (entry == tabend)
            *dst++ = EFILLIND;
        else
            *dst++ = (entry - Engphtab[0]) >> 1;

        /* The separator unit ends the compressible part: copy the
           right half of the special unchanged.  */
        if (first == E_LRSEP) {
            strecpy(dst, src);
            return;
        }
    }
    *dst = 0;
}

/* Create the fully encoded form of a word.  This is used to look up a word
   in a lexicon or CLAM.  `shortflag' is used for lexicon lookup when the
   lexicon contains no binning information.  */

/* phfull: build the fully encoded form of a word.

   source     the word to encode.
   dest       output: two bin characters followed by the compressed
              phonetic encoding of the word.
   shortflag  non-zero: do not compute a bin, store MINALPH in both bin
              positions instead.

   The word is encoded with phencode(), its bin is taken from phbin()
   (unless shortflag is set), the encoding is compressed in place with
   phcompress() and finally copied behind the two bin characters with
   strecpy().  */
VOID
phfull(source, dest, shortflag)
char   *source;                /* The string to encode */
char   *dest;                  /* The output string */
int     shortflag;              /* Encode for a single bin */
{
    char encoded[MAXPHWORD];    /* working copy of the encoded word */

    /* Two-character phonetic form of the word. */
    phencode(source, encoded);

    /* Bin characters: fixed for the short form, computed otherwise. */
    if (shortflag) {
        dest[0] = MINALPH;
        dest[1] = MINALPH;
    } else {
        phbin(encoded, dest);
    }

    /* One-character phonetic form, placed after the bin characters. */
    phcompress(encoded);
    strecpy(dest + 2, encoded);
}

/* Convert a word into its canonical form and return its flag. */

/* doflags: classify a word, write its canonical form to `outword' and
   return the word's flags.

   word0    the NUL-terminated input word.
   outword  receives the canonical form (see below); its size is not
            checked here.
   inlen    the word is rejected when it has `inlen' or more characters.

   Returns ERROR, before anything is written to `outword', when the word
   is empty, contains E_LRSEP, or is too long.  Otherwise returns a
   combination of the IW_ flags:
     IW_SPECIAL  the word contains a character that fails scvalid() or is
                 greater than 0x80, has a dot that is not its last
                 character (and is not an all-dots word), or has a
                 capitalization that fits none of the cases below;
     IW_ALLDOT   letters and dots strictly alternate and the word ends
                 in a dot;
     IW_ENDDOT   the last character is a dot (not set for IW_ALLDOT);
     IW_ACRONYM  at least one upper case letter and no lower case letter;
     IW_PROPER   only the first character is upper case;
     IW_COMMON   no upper case letters.
   For specials only IW_SPECIAL and IW_ENDDOT are returned.

   Canonical form: the word in lower case with dots removed; for
   specials the dots are kept and E_LRSEP plus the original word are
   appended, with a trailing dot removed from both halves.  */
int doflags(word0, outword, inlen)
char    *word0;                 /* the word to convert */
char    *outword;               /* the place for the converted word */
int     inlen;                  /* maximum input word length */
{
char    *word;          /* pointer into word */
int     cc;             /* character temp */
int     flags;          /* used to accumulate the word's flags */
int     dot;            /* the word contains a dot */
                        /* (only dots that are followed by another
                           character set this) */
int     alldots;        /* the word is alternating letters and dots */
int     ncaps;          /* the number of upper case letters */
int     allcaps;        /* all letters are upper case */
int     chars;          /* number of characters */

   /* Initialize everything */

   word = word0;
   if (word[0] == 0) {
       return (ERROR);
   }
   flags = chars = ncaps = 0;
   allcaps = alldots = TRUE;
   dot = FALSE;

   /* Scan the word and test the type of the word.  `chars' is
      incremented before the tests below, so it is the 1-based position
      of the current character, and `word' already points at the
      following character.  */

   while (cc = ctoi(*word++)) {
       ++chars;
       if (cc == E_LRSEP) {
           return (ERROR);
       }
       if (!scvalid(cc) || cc > 0x80) {	/* high-ascii hack */
           /* such a character makes the word a special and is left
              out of the case and dot tests below */
           flags = IW_SPECIAL;
           alldots = FALSE;
           continue;
       }
       if (isupper(cc)) {
           ++ncaps;
       }
       if (islower(cc)) {
           allcaps = FALSE;
       }
       if (chars & 1) {
                        /* This is a character in an even position
                           (counting from 0); if it is a dot then this
                           word is not an ALLDOT word.  */

           if (cc == '.') {
               alldots = FALSE;
               if (*word) {
                   dot = TRUE;
               }
           }
       } else {
                        /* This character is in an odd position; if it is
                           not a dot then this word is not ALLDOT.  */

           if (cc != '.') {
               alldots = FALSE;
           } else if (*word) {
               dot = TRUE;
           }
       }
   }
        /* Return with error if there are too many input characters.  */

   if (chars >= inlen) {
       return (ERROR);
   }
   /* Assign those flags which depend on the dots in the word.  The scan
      left `word' one past the terminating NUL, so stepping back two
      makes it point at the last character of the word.  */

   word -= 2;
   if (dot && alldots && *word == '.') {
       flags = IW_ALLDOT;
   } else {
       if (dot) {
           flags = IW_SPECIAL;
       }
       if (*word == '.') {
           flags |= IW_ENDDOT;
       }
   }
        /* Assign those flags which depend on the capitalization of the
           word.  */

   if (allcaps && ncaps) {
       flags |= IW_ACRONYM;
   } else if (isupper(ctoi(*word0)) && ncaps == 1) {
       flags |= IW_PROPER;
   } else if (ncaps == 0) {
       flags |= IW_COMMON;
   } else {
       flags |= IW_SPECIAL;
   }
   /* Copy the word to the output buffer. Translate upper case to
      lower case. Remove dots unless the word is a special.  The comma
      expression converts the current character first and then uses the
      unconverted character as the loop test while advancing.  */

   word = word0;
   while (cc = tolower(ctoi(*word)), *word++) {
       if (cc != '.' || flags & IW_SPECIAL) {
           *outword++ = cc;
       }
   }
        /* Generate the remaining part of the output word for specials.
           This consists of E_LRSEP followed by the input word.  If the
           input word has a trailing dot then strip this from both parts of
           the special and set the return flag to indicate the trailing
           dot.  */

   if (flags & IW_SPECIAL) {

       /* drop every flag except IW_SPECIAL and IW_ENDDOT */
       flags &= IW_SPECIAL | IW_ENDDOT;
       if (flags & IW_ENDDOT) {
           /* back up over the trailing dot of the first half */
           --outword;
       }
       *outword++ = E_LRSEP;

       word = word0;
       while (*outword++ = *word++)
           ;
       /* back up over the copied NUL ... */
       --outword;
       if (flags & IW_ENDDOT) {
           /* ... and over the trailing dot of the second half */
           --outword;
       }
   }
   /* Terminate the output string and return the word's flags */

   *outword = 0;

   return (flags);
}

/* 'Undoflags' reconstitutes a word from the string and the flag byte
   produced by 'doflags'.  It does this as expected for all types other than
   specials.  For specials, it assumes that the word presented to it has had
   the text up to and including the E_LRSEP character removed, and so it
   just copies the string and adds the trailing dot for IW_ENDDOT words.  */

/* undoflags: rebuild a word from its stored string and its flags.

   word     the stored string; for specials this is the text that
            followed the E_LRSEP character.
   flags    the IW_ flags of the word.
   outword  receives the rebuilt, NUL-terminated word.

   Specials are copied unchanged, with a dot appended when IW_ENDDOT is
   set.  For all other words the first letter is capitalized when the
   case bits say IW_PROPER, every lower case letter is capitalized when
   they say IW_ACRONYM, a dot follows every letter when IW_ALLDOT is set,
   and one dot is appended when IW_ENDDOT is set.  */
VOID
undoflags(word, flags, outword)
 char *word;           /* The word to convert */
 int flags;             /* The flags for the word */
 char *outword;        /* The output word */
{
	int c;          /* current character */
	int casebits;   /* the capitalization part of the flags */
	int alldot;     /* non-zero when every letter gets a dot */

	/* Specials: plain copy, then the optional trailing dot.  */

	if (flags & IW_SPECIAL) {
		for (; *word; ++word, ++outword) {
			*outword = *word;
		}
		if (flags & IW_ENDDOT) {
			*outword++ = '.';
		}
		*outword = 0;
		return;
	}

	casebits = flags & IW_CASE;
	alldot = flags & IW_ALLDOT;

	/* Proper words: the first character is consumed here so that it
	   can be capitalized, followed by its dot for all-dot words.  */

	if (casebits == IW_PROPER) {
		c = ctoi(*word++);
		*outword++ = _toupper(c);
		if (alldot) {
			*outword++ = '.';
		}
	}

	/* Remaining characters: capitalize for acronyms and add the dot
	   after each character for all-dot words.  The final dot for
	   IW_ENDDOT is added afterwards, so a word with both IW_ALLDOT
	   and IW_ENDDOT set would end in two dots; the flags are expected
	   never to be combined that way.  */

	while ((c = ctoi(*word++)) != 0) {
		if (islower(c) && casebits == IW_ACRONYM) {
			c = _toupper(c);
		}
		*outword++ = c;
		if (alldot) {
			*outword++ = '.';
		}
	}

	if (flags & IW_ENDDOT) {
		*outword++ = '.';
	}
	*outword = 0;
}