mirror of
https://github.com/antoinevignau/source.git
synced 2024-10-31 22:06:40 +00:00
1 line
5.5 KiB
C
Executable File
1 line
5.5 KiB
C
Executable File
/***********************************************************************\
|
|
|
|
Filename: correct.c
|
|
|
|
\***********************************************************************/
|
|
|
|
#include <cschar.h>
|
|
#include "query.h"
|
|
#include "environ.h"
|
|
#include "correct.h"
|
|
#include "string.h"
|
|
/*#include "ctype.h"*/
|
|
|
|
#define TYPOTHRESH 2 /* number of leading typo candidates whose similarity correct() leaves untouched */
|
|
#define TYPOLOW 0xDA00 /* similarity base given by correct() to typo candidates ranked at or beyond TYPOTHRESH */
|
|
|
|
static char *Altlist[MAXLISTALTS + 1]; /* NULL-terminated list of alternative words; returned by correct() and corrfinish(). */
|
|
|
|
/* Set the query for correction. */
|
|
|
|
setcorr(query)
|
|
char *query;
|
|
{
|
|
/* Put the query into the SC globals and check for errors. */
|
|
|
|
if (!setquery(query)) {
|
|
Altlist[0] = NULL;
|
|
Scerror |= ERR_CORRECT;
|
|
return (FALSE);
|
|
}
|
|
/* remove the right half of the word, if it is a special. */
|
|
|
|
if (query = strchr(Qulookup, E_LRSEP)) {
|
|
*query = 0;
|
|
}
|
|
Sclen = Qulen = strecpy(Sclookup, Qulookup) - Sclookup;
|
|
|
|
/* printf("Qulen = %d\n", Qulen);*/
|
|
|
|
/* Do language specific initialization. */
|
|
|
|
Corankf = 0;
|
|
return (TRUE);
|
|
}
|
|
|
|
/* Initialize the correction globals. */
|
|
|
|
VOID
|
|
corrinit()
|
|
{
|
|
Sclen = strlen((char *)Sclookup);
|
|
Corcount = 0;
|
|
Altlist[0] = NULL;
|
|
Coparam = Sclen < E_SHORT ? Engphspar : Engphpar;
|
|
Cominrank = Coqthresh = Sclen < E_SHORT ? 0x8000 : E_THRESH;
|
|
}
|
|
|
|
/* Use undoflags() to restore alternatives. Look at query flags, and adjust
|
|
correction alternatives accordingly (proper, acronym, enddot). */
|
|
|
|
char **
|
|
corrfinish()
|
|
{
|
|
extern VOID undoflags();
|
|
|
|
ALTINFO *alt; /* The corection alternate */
|
|
char *ptr;
|
|
char *wptr;
|
|
int i;
|
|
int n;
|
|
int maxlen;
|
|
int newflag; /* Modification to output based on input. */
|
|
char **list; /* The output list pointer */
|
|
char word[MAXWORD];
|
|
|
|
/* Restore dots and caps to alternatives. Add capitalization and
|
|
endot based on input word (newflag). Special case if the query
|
|
word is only one character: mask ACRONYM to proper if present.
|
|
*/
|
|
|
|
/*printf("in corrfinish...\n");*/
|
|
|
|
if (Qulen == 1) {
|
|
newflag = Quflags & IW_PROPER;
|
|
} else {
|
|
newflag = Quflags & IW_CASE;
|
|
}
|
|
/* Skip prefix, if present. Since Qulookup is not used again, just
|
|
over-write it, fix Qulen also */
|
|
|
|
ptr = wptr = Qulookup;
|
|
while (*wptr++ = tolower(*ptr), *++ptr)
|
|
;
|
|
|
|
/* Find the item in the ranked list with the largest common
|
|
subsequence when compared against the query. */
|
|
|
|
/*printf("Qulen = %d\n", Qulen);*/
|
|
|
|
maxlen = 0;
|
|
for (i = 0; i < Corcount; ++i)
|
|
{
|
|
alt = &Coranklist[i];
|
|
|
|
/* Convert alternate to lower-case before
|
|
seqlen. */
|
|
|
|
wptr = word;
|
|
ptr = alt->al_word;
|
|
|
|
while (*wptr++ = tolower(*ptr), *++ptr)
|
|
;
|
|
alt->al_simil = n = seqlen(word, Qulookup,
|
|
strlen((char *)word), Qulen);
|
|
if (n > maxlen) {
|
|
maxlen = n;
|
|
}
|
|
}
|
|
maxlen -= QDIFTHRESH;
|
|
|
|
/* Construct the correction list. */
|
|
|
|
list = Altlist;
|
|
for (i = 0; i < Corcount; ++i)
|
|
{
|
|
/* Get the correction candidate. */
|
|
|
|
alt = &Coranklist[Coorder[i]];
|
|
|
|
/* Make sure there are enough common letters. Ignore items
|
|
sufficiently shorter than the best. */
|
|
|
|
/* n = alt->al_simil;
|
|
if (n == 0 || n < Qulen - QLENTHRESH)
|
|
continue;
|
|
if (n < maxlen)
|
|
continue; */
|
|
|
|
/* Unflag the word. */
|
|
|
|
undoflags(alt->al_word, ctoi(alt->al_flags), word);
|
|
|
|
strecpy(alt->al_word, word);
|
|
|
|
/* Modify the word so that, if the input word was proper,
|
|
acronym, or enddot the output is also. */
|
|
|
|
if (*(strchr(alt->al_word, 0) - 1) != '.')
|
|
undoflags(alt->al_word, newflag | (Quflags & IW_ENDDOT),
|
|
alt->al_word);
|
|
else
|
|
undoflags(alt->al_word, newflag, alt->al_word);
|
|
|
|
/* Put the candidate into the correction list. */
|
|
|
|
*list++ = alt->al_word;
|
|
}
|
|
|
|
/* Terminate the correction list and return its start. */
|
|
|
|
*list++ = NULL;
|
|
return (Altlist);
|
|
}
|
|
|
|
/* This routine calculates the longest common subsequence between two
|
|
strings. Return TRUE if within the threshold set. Algorithm taken from:
|
|
Hunt, J. W. and T. G. Szymanski [1977]. "A fast algorithm for
|
|
computing longest common subsequences," Communications of the ACM 20:5,
|
|
350-353. */
|
|
|
|
seqlen(pa0, pb0, lena, lenb)
|
|
char *pa0;
|
|
char *pb0;
|
|
int lena;
|
|
int lenb;
|
|
{
|
|
char *tp;
|
|
int j;
|
|
char *pb = pb0;
|
|
char *pa = pa0;
|
|
char th[MAXWORD];
|
|
|
|
/*printf("in seqlen, lena = %d, lenb = %d\n", lena, lenb);*/
|
|
|
|
j = (lena < lenb ? lena : lenb) + 1;
|
|
tp = th;
|
|
while (--j >= 0)
|
|
*tp++ = 127;
|
|
while (--lena >= 0)
|
|
{
|
|
pb += j = lenb;
|
|
while (--j >= 0)
|
|
{
|
|
if (*pa != *--pb)
|
|
continue;
|
|
tp = th;
|
|
while (*tp++ < j)
|
|
;
|
|
*(tp-1) = j;
|
|
}
|
|
++pa;
|
|
}
|
|
tp = th;
|
|
while (*tp++ < 127)
|
|
;
|
|
return (tp - (th + 1));
|
|
}
|
|
|
|
/* User interface to the whole correct system. */
|
|
|
|
char **
|
|
correct(query)
|
|
char *query;
|
|
{
|
|
int i;
|
|
|
|
if (!setcorr(query))
|
|
return (Altlist);
|
|
corrinit();
|
|
clxtypo(Sclookup);
|
|
|
|
/* for all typo candidates below TYPOTHRESH, change the
|
|
pseudo-similarity to TYPOLOW so that phonetic corrections are
|
|
accommodated above this */
|
|
|
|
for (i = TYPOTHRESH; i < Corcount; ++i)
|
|
Coranklist[Coorder[i]].al_simil = TYPOLOW - i;
|
|
if (cordophon())
|
|
return (Altlist);
|
|
return (corrfinish());
|
|
}
|