Better spelling suggestions, using a secondary algorithm as a tie-breaker to favor the most relevant suggestions

This commit is contained in:
Brendan Robert 2015-09-12 23:22:58 -05:00
parent 287ceca85f
commit 0ca78ec5f8
3 changed files with 39 additions and 18 deletions

View File

@ -143,7 +143,7 @@ public class DataUtilities {
}
}
}
return Math.max(m, n) - dist[m][n];
return dist[m][n];
}
/**
@ -155,7 +155,7 @@ public class DataUtilities {
* @param c1
* @param c2
* @param width Search window size
* @return Overall similarity score (higher is beter)
* @return Overall similarity score (higher is better)
*/
public static double rankMatch(String c1, String c2, int width) {
double score = 0;

View File

@ -28,7 +28,6 @@ import org.badvision.outlaweditor.data.DataUtilities;
*/
public class SpellChecker {
private static HashMap<Character, Set<String>> dictionary;
private final double SIMILARITY_THRESHOLD = 0.5;
public SpellChecker() {
loadDictionary();
@ -41,15 +40,10 @@ public class SpellChecker {
for (String word : words) {
Set<Suggestion> suggestions = getSuggestions(word);
if (suggestions != null && !suggestions.isEmpty()) {
Suggestion first = suggestions.stream().findFirst().get();
if (first.similarity == 1.0) {
continue;
} else {
SpellResponse.Source source = new SpellResponse.Source();
source.start = pos;
source.word = word;
response.corrections.put(source, suggestions);
}
SpellResponse.Source source = new SpellResponse.Source();
source.start = pos;
source.word = word;
response.corrections.put(source, suggestions);
}
pos += word.length() + 1;
@ -86,16 +80,18 @@ public class SpellChecker {
String lower = word.toLowerCase();
Character first = lower.charAt(0);
Set<String> words = dictionary.get(first);
int length = lower.length();
double threshold = length <= 2 ? 0 : Math.log(length-1) * 1.75;
if (words != null) {
if (words.contains(lower)) {
if (lower.length() <= 2 || words.contains(lower)) {
return null;
}
words.parallelStream().forEach((String dictWord) -> {
int distance = DataUtilities.levenshteinDistance(lower, dictWord);
double similarity = distance / ((double) Math.max(lower.length(), dictWord.length()));
if (similarity >= SIMILARITY_THRESHOLD) {
if (distance <= threshold) {
Suggestion suggestion = new Suggestion();
suggestion.similarity = similarity;
suggestion.original = lower;
suggestion.similarity = distance;
suggestion.word = dictWord;
suggestions.add(suggestion);
}

View File

@ -9,18 +9,43 @@
*/
package org.badvision.outlaweditor.spelling;
import static org.badvision.outlaweditor.data.DataUtilities.rankMatch;
/**
 * A candidate replacement for a misspelled word, ordered so that the most
 * relevant candidates sort first.
 *
 * <p>Ordering (see {@link #compareTo(Suggestion)}):
 * <ol>
 *   <li>ascending {@link #similarity} (lower sorts first);</li>
 *   <li>on a tie, descending similarity rank as computed by {@code rankMatch}
 *       (higher sorts first);</li>
 *   <li>on a further tie, natural {@link String} order of {@link #word}.</li>
 * </ol>
 *
 * <p>NOTE(review): no {@code equals}/{@code hashCode} override is visible here, so
 * {@code compareTo} returning 0 does not imply {@code equals} -- confirm that
 * callers storing suggestions in sorted collections are fine with that.
 */
public class Suggestion implements Comparable<Suggestion> {

    /** The text the suggestion is being offered for; compared against {@link #word} for ranking. */
    public String original;

    /** The suggested replacement word. */
    public String word;

    /**
     * Primary sort key; lower values sort first.
     * NOTE(review): the spell checker appears to store an edit distance here -- confirm.
     */
    public double similarity;

    /**
     * Lazily computed tie-breaker score; a negative value means "not computed yet".
     * The value is cached on first use, so later changes to {@link #word} or
     * {@link #original} are not reflected in it.
     */
    private double similarityRank = -1;

    /**
     * @return the suggested replacement word
     */
    public String getWord() {
        return word;
    }

    /**
     * @return the primary sort key (lower sorts first)
     */
    public double getSimilarity() {
        return similarity;
    }

    /**
     * Orders suggestions by ascending {@link #similarity}, then by descending
     * similarity rank, then alphabetically by {@link #word}.
     *
     * @param o the suggestion to compare against
     * @return a negative value, zero, or a positive value as this suggestion sorts
     *         before, equal to, or after {@code o}
     * @throws NullPointerException if {@code o} is null, or if {@link #word} is null
     *         when the word tie-breaker is reached
     */
    @Override
    public int compareTo(Suggestion o) {
        // Double.compare avoids the subtract-and-cast idiom, which reports NaN as
        // "equal" to every value and so breaks the ordering contract.
        if (similarity != o.similarity) {
            return Double.compare(similarity, o.similarity);
        }
        double rank1 = getSimilarityRank();
        double rank2 = o.getSimilarityRank();
        if (rank1 != rank2) {
            // Arguments are swapped on purpose: a higher rank sorts first.
            return Double.compare(rank2, rank1);
        }
        return word.compareTo(o.word);
    }

    /**
     * Returns the tie-breaker score, computing and caching it on first use as the
     * sum of {@code rankMatch} with window sizes 3 and 2.
     *
     * <p>If {@code rankMatch} ever yields a negative total, the value is simply
     * recomputed on each call, because negative doubles as the "unset" marker.
     *
     * @return the cached or freshly computed similarity rank
     */
    private double getSimilarityRank() {
        if (similarityRank < 0) {
            similarityRank = rankMatch(word, original, 3) + rankMatch(word, original, 2);
        }
        return similarityRank;
    }
}