2023-03-04 03:45:20 +01:00

1 line
5.5 KiB
C
Executable File

/***********************************************************************\
Filename: correct.c
\***********************************************************************/
#include <cschar.h>
#include "query.h"
#include "environ.h"
#include "correct.h"
#include "string.h"
/*#include "ctype.h"*/
#define TYPOTHRESH 2 /* number of typo candidates to keep */
#define TYPOLOW 0xDA00 /* base pseudo-similarity for typo candidates ranked at or past TYPOTHRESH */
static char *Altlist[MAXLISTALTS + 1]; /* Pointers to the words; returned. */
/* Set the query for correction.
   Hands `query` to setquery().  If that fails, the alternative list is
   emptied, ERR_CORRECT is or-ed into Scerror and FALSE is returned.
   Otherwise Qulookup is cut at the first E_LRSEP (when one is present),
   copied into Sclookup with strecpy(), Sclen and Qulen are both set to
   strecpy()'s result minus Sclookup (presumably the copied length;
   strecpy is defined elsewhere), Corankf is cleared and TRUE is
   returned. */
int
setcorr(query)
char *query;
{
    char *sep;    /* location of E_LRSEP inside Qulookup, or NULL */

    /* Put the query into the SC globals and check for errors. */
    if (!setquery(query)) {
        Altlist[0] = NULL;
        Scerror |= ERR_CORRECT;
        return (FALSE);
    }

    /* Remove the right half of the word, if it is a special. */
    sep = strchr(Qulookup, E_LRSEP);
    if (sep != NULL) {
        *sep = 0;
    }

    Sclen = Qulen = strecpy(Sclookup, Qulookup) - Sclookup;

    /* Do language specific initialization. */
    Corankf = 0;
    return (TRUE);
}
/* Initialize the correction globals.
   Empties the alternative list, zeroes Corcount and recomputes Sclen from
   Sclookup.  A lookup word shorter than E_SHORT selects Engphspar and a
   threshold of 0x8000; any other length selects Engphpar and E_THRESH.
   The chosen threshold is stored in both Coqthresh and Cominrank. */
VOID
corrinit()
{
    int is_short;    /* non-zero when the lookup word is below E_SHORT */

    /* Nothing has been collected yet. */
    Altlist[0] = NULL;
    Corcount = 0;

    Sclen = strlen((char *)Sclookup);
    is_short = Sclen < E_SHORT;

    Coparam = is_short ? Engphspar : Engphpar;
    Coqthresh = is_short ? 0x8000 : E_THRESH;
    Cominrank = Coqthresh;
}
/* Use undoflags() to restore alternatives. Look at query flags, and adjust
correction alternatives accordingly (proper, acronym, enddot). */
char **
corrfinish()
{
extern VOID undoflags();
ALTINFO *alt; /* The corection alternate */
char *ptr;
char *wptr;
int i;
int n;
int maxlen;
int newflag; /* Modification to output based on input. */
char **list; /* The output list pointer */
char word[MAXWORD];
/* Restore dots and caps to alternatives. Add capitalization and
endot based on input word (newflag). Special case if the query
word is only one character: mask ACRONYM to proper if present.
*/
/*printf("in corrfinish...\n");*/
if (Qulen == 1) {
newflag = Quflags & IW_PROPER;
} else {
newflag = Quflags & IW_CASE;
}
/* Skip prefix, if present. Since Qulookup is not used again, just
over-write it, fix Qulen also */
ptr = wptr = Qulookup;
while (*wptr++ = tolower(*ptr), *++ptr)
;
/* Find the item in the ranked list with the largest common
subsequence when compared against the query. */
/*printf("Qulen = %d\n", Qulen);*/
maxlen = 0;
for (i = 0; i < Corcount; ++i)
{
alt = &Coranklist[i];
/* Convert alternate to lower-case before
seqlen. */
wptr = word;
ptr = alt->al_word;
while (*wptr++ = tolower(*ptr), *++ptr)
;
alt->al_simil = n = seqlen(word, Qulookup,
strlen((char *)word), Qulen);
if (n > maxlen) {
maxlen = n;
}
}
maxlen -= QDIFTHRESH;
/* Construct the correction list. */
list = Altlist;
for (i = 0; i < Corcount; ++i)
{
/* Get the correction candidate. */
alt = &Coranklist[Coorder[i]];
/* Make sure there are enough common letters. Ignore items
sufficiently shorter than the best. */
/* n = alt->al_simil;
if (n == 0 || n < Qulen - QLENTHRESH)
continue;
if (n < maxlen)
continue; */
/* Unflag the word. */
undoflags(alt->al_word, ctoi(alt->al_flags), word);
strecpy(alt->al_word, word);
/* Modify the word so that, if the input word was proper,
acronym, or enddot the output is also. */
if (*(strchr(alt->al_word, 0) - 1) != '.')
undoflags(alt->al_word, newflag | (Quflags & IW_ENDDOT),
alt->al_word);
else
undoflags(alt->al_word, newflag, alt->al_word);
/* Put the candidate into the correction list. */
*list++ = alt->al_word;
}
/* Terminate the correction list and return its start. */
*list++ = NULL;
return (Altlist);
}
/* Compute the length of the longest common subsequence (LCS) of two
   character sequences.  Algorithm taken from:
   Hunt, J. W. and T. G. Szymanski [1977]. "A fast algorithm for
   computing longest common subsequences," Communications of the ACM 20:5,
   350-353.

   pa0, lena  first sequence and the number of characters to use from it
   pb0, lenb  second sequence and the number of characters to use from it

   Returns the LCS length as an int (0 when either length is <= 0).
   The sequences are compared byte for byte; no NUL terminator is
   required.  Positions are stored in `char` with SEQLEN_MAX as the
   "unused" marker, so only the first SEQLEN_MAX characters of each
   sequence take part; longer inputs are clamped to that length. */
#define SEQLEN_MAX 127    /* sentinel value and maximum usable length */

int
seqlen(pa0, pb0, lena, lenb)
char *pa0;
char *pb0;
int lena;
int lenb;
{
    /* th[k] holds the smallest index in b at which a common subsequence
       of length k + 1 can end, or SEQLEN_MAX if none has been found. */
    char th[SEQLEN_MAX + 1];
    char *tp;
    char *pa;
    char *pb;
    int j;

    /* Keep every stored position below the sentinel and every table
       index inside th[]. */
    if (lena > SEQLEN_MAX)
        lena = SEQLEN_MAX;
    if (lenb > SEQLEN_MAX)
        lenb = SEQLEN_MAX;
    if (lena <= 0 || lenb <= 0)
        return (0);

    /* An LCS is never longer than the shorter input, so min + 1 slots
       always leave one sentinel to stop the scans below. */
    j = (lena < lenb ? lena : lenb) + 1;
    for (tp = th; --j >= 0; )
        *tp++ = SEQLEN_MAX;

    for (pa = pa0; --lena >= 0; ++pa) {
        /* Walk b from its last character to its first so that one
           character of a cannot extend a subsequence twice. */
        pb = pb0 + lenb;
        for (j = lenb; --j >= 0; ) {
            if (*pa != *--pb)
                continue;
            /* Lower the first threshold that is >= j down to j. */
            for (tp = th; *tp < j; ++tp)
                ;
            *tp = j;
        }
    }

    /* The number of slots in front of the first sentinel is the answer. */
    for (tp = th; *tp < SEQLEN_MAX; ++tp)
        ;
    return ((int)(tp - th));
}
/* User interface to the whole correct system.
   Runs setcorr(), corrinit() and clxtypo() on the query, then demotes the
   typo candidates ranked at or past TYPOTHRESH, and finally calls
   cordophon().  Returns corrfinish()'s list when cordophon() returns
   zero; returns Altlist directly when setcorr() fails or cordophon()
   returns non-zero. */
char **
correct(query)
char *query;
{
    int rank;    /* position in Coorder[] of the candidate being demoted */

    if (setcorr(query)) {
        corrinit();
        clxtypo(Sclookup);

        /* for all typo candidates below TYPOTHRESH, change the
           pseudo-similarity to TYPOLOW so that phonetic corrections are
           accommodated above this */
        rank = TYPOTHRESH;
        while (rank < Corcount) {
            Coranklist[Coorder[rank]].al_simil = TYPOLOW - rank;
            ++rank;
        }

        if (!cordophon()) {
            return (corrfinish());
        }
    }
    return (Altlist);
}