antoine-source/appleworksgs/Spell/Src/CORRECT.C

/***********************************************************************\

    Filename: correct.c

\***********************************************************************/

#include <cschar.h>
#include "query.h"
#include "environ.h"
#include "correct.h"
#include "string.h"
/*#include "ctype.h"*/

/* Number of leading typo candidates (in Coorder order) whose similarity
   value is left as computed; see correct(). */
#define TYPOTHRESH      2
/* Base pseudo-similarity for typo candidates ranked at or beyond
   TYPOTHRESH: candidate i is given TYPOLOW - i by correct(). */
#define TYPOLOW         0xDA00

/* NULL-terminated list of pointers to the alternative words; this is the
   array handed back by correct() and corrfinish(). */
static char *Altlist[MAXLISTALTS + 1];

/* Install `query' as the word to be corrected.

   setquery() loads the word into the SC globals.  If it reports failure
   the alternatives list is emptied, ERR_CORRECT is or-ed into Scerror and
   FALSE is returned.  Otherwise the look-up form of the query is cut at
   the first E_LRSEP character (dropping the right half of a "special"
   word), copied into Sclookup, its length is recorded in both Qulen and
   Sclen, Corankf is cleared and TRUE is returned.  */

setcorr(query)
char   *query;
{
char   *sep;    /* first E_LRSEP in Qulookup, or NULL */

    if (setquery(query)) {

        /* Keep only the left half of a special word. */

        sep = strchr(Qulookup, E_LRSEP);
        if (sep != NULL) {
            *sep = 0;
        }

        /* strecpy() hands back a pointer into Sclookup; its distance
           from the start is stored as the length of the copied word. */

        Qulen = strecpy(Sclookup, Qulookup) - Sclookup;
        Sclen = Qulen;

        /* Do language specific initialization. */

        Corankf = 0;
        return (TRUE);
    }

    /* The query was rejected: return an empty list and flag the error. */

    Altlist[0] = NULL;
    Scerror |= ERR_CORRECT;
    return (FALSE);
}

/* Reset the correction globals for the word currently in Sclookup.

   The candidate count and the alternatives list are cleared and Sclen is
   recomputed.  Words shorter than E_SHORT use the Engphspar parameters
   and a threshold of 0x8000; all other words use Engphpar and E_THRESH.
   The chosen threshold is stored in both Coqthresh and Cominrank.  */

VOID
corrinit()
{
int     isshort;        /* non-zero when the word is below E_SHORT */

    Altlist[0] = NULL;
    Corcount = 0;

    Sclen = strlen((char *)Sclookup);
    isshort = Sclen < E_SHORT;

    Coparam = isshort ? Engphspar : Engphpar;
    Coqthresh = isshort ? 0x8000 : E_THRESH;
    Cominrank = Coqthresh;
}

/* Use undoflags() to restore alternatives.  Look at query flags, and adjust
   correction alternatives accordingly (proper, acronym, enddot).  */

char **
corrfinish()
{
extern VOID undoflags();

ALTINFO *alt;  /* The corection alternate */
char *ptr;
char *wptr;
int i;
int n;
int     maxlen;
int     newflag;        /* Modification to output based on input. */
char   **list;         /* The output list pointer */
char   word[MAXWORD];

        /* Restore dots and caps to alternatives.  Add capitalization and
           endot based on input word (newflag).  Special case if the query
           word is only one character: mask ACRONYM to proper if present.
           */

           /*printf("in corrfinish...\n");*/

   if (Qulen == 1) {
       newflag = Quflags & IW_PROPER;
   } else {
       newflag = Quflags & IW_CASE;
   }
        /* Skip prefix, if present.  Since Qulookup is not used again, just
           over-write it, fix Qulen also */

   ptr = wptr = Qulookup;
   while (*wptr++ = tolower(*ptr), *++ptr)
       ;

        /* Find the item in the ranked list with the largest common
      subsequence when compared against the query. */

    /*printf("Qulen = %d\n", Qulen);*/

   maxlen = 0;
   for (i = 0; i < Corcount; ++i)
   {
       alt = &Coranklist[i];

                /* Convert alternate to lower-case before
          seqlen.  */

       wptr = word;
       ptr = alt->al_word;

       while (*wptr++ = tolower(*ptr), *++ptr)
           ;
       alt->al_simil = n = seqlen(word, Qulookup,
           strlen((char *)word), Qulen);
       if (n > maxlen) {
           maxlen = n;
       }
   }
   maxlen -= QDIFTHRESH;

   /* Construct the correction list. */

   list = Altlist;
   for (i = 0; i < Corcount; ++i)
   {
       /* Get the correction candidate. */

       alt = &Coranklist[Coorder[i]];

       /* Make sure there are enough common letters.  Ignore items
          sufficiently shorter than the best.  */

/*       n = alt->al_simil;
       if (n == 0 || n < Qulen - QLENTHRESH)
           continue;
       if (n < maxlen)
           continue;        */

       /* Unflag the word. */

       undoflags(alt->al_word, ctoi(alt->al_flags), word);

       strecpy(alt->al_word, word);

       /* Modify the word so that, if the input word was proper,
          acronym, or enddot the output is also.  */

       if (*(strchr(alt->al_word, 0) - 1) != '.')
           undoflags(alt->al_word,  newflag | (Quflags & IW_ENDDOT),
             alt->al_word);
       else
           undoflags(alt->al_word, newflag, alt->al_word);

       /* Put the candidate into the correction list. */

       *list++ = alt->al_word;
   }

   /* Terminate the correction list and return its start. */

   *list++ = NULL;
   return (Altlist);
}

/* Return the length of the longest common subsequence of two strings.

   pa0/lena and pb0/lenb give each string and its length; only the first
   lena and lenb characters are examined, so the strings need not be
   terminated.  A length of zero or less yields 0.

   Algorithm taken from: Hunt, J.  W.  and T.  G.  Szymanski [1977].  "A
   fast algorithm for computing longest common subsequences,"
   Communications of the ACM 20:5, 350-353.

   th[k] holds the smallest index in b at which a common subsequence of
   length k + 1 can end, given the part of a scanned so far.  The value
   127 marks an unused slot and is also the sentinel that stops both
   scans of th[].  The caller must therefore keep lenb at or below 127
   and the shorter of the two lengths below MAXWORD.  */

int
seqlen(pa0, pb0, lena, lenb)
char   *pa0;
char   *pb0;
int     lena;
int     lenb;
{
char    th[MAXWORD];
char   *tp;
int     i;
int     j;
int     slots;

   /* An empty string shares nothing.  Returning here also keeps a
      negative length from leaving th[] without its sentinel.  */

   if (lena <= 0 || lenb <= 0)
       return (0);

   /* One slot per possible subsequence length, plus the sentinel. */

   slots = (lena < lenb ? lena : lenb) + 1;
   for (tp = th; slots > 0; --slots)
       *tp++ = 127;

   for (i = 0; i < lena; ++i)
   {
       /* Scan b from right to left so that one character of a cannot
          extend a subsequence it has just created.  */

       for (j = lenb - 1; j >= 0; --j)
       {
           if (pa0[i] != pb0[j])
               continue;

           /* Lower the first slot that is not already below j. */

           for (tp = th; *tp < j; ++tp)
               ;
           *tp = j;
       }
   }

   /* The number of slots in use is the subsequence length. */

   for (tp = th; *tp < 127; ++tp)
       ;
   return ((int)(tp - th));
}

/* User interface to the whole correct system.

   Sets up `query', collects typo candidates with clxtypo(), then runs the
   phonetic pass cordophon() and finishes the list with corrfinish().
   Returns the NULL-terminated list of alternatives.  When setcorr()
   fails, or when cordophon() returns non-zero, Altlist is returned
   without corrfinish() being run.  */

char **
correct(query)
char   *query;
{
int     rank;

   if (setcorr(query)) {
       corrinit();
       clxtypo(Sclookup);

       /* Typo candidates ranked at or beyond TYPOTHRESH have their
          pseudo-similarity replaced by TYPOLOW - rank, so that
          phonetic corrections can be accommodated above them.  */

       rank = TYPOTHRESH;
       while (rank < Corcount) {
           Coranklist[Coorder[rank]].al_simil = TYPOLOW - rank;
           ++rank;
       }

       if (!cordophon())
           return (corrfinish());
   }
   return (Altlist);
}