Better spelling suggestions, using a secondary algorithm as a tie-breaker to favor the most relevant suggestions

This commit is contained in:
Brendan Robert 2015-09-12 23:22:58 -05:00
parent 287ceca85f
commit 0ca78ec5f8
3 changed files with 39 additions and 18 deletions

View File

@ -143,7 +143,7 @@ public class DataUtilities {
} }
} }
} }
return Math.max(m, n) - dist[m][n]; return dist[m][n];
} }
/** /**
@ -155,7 +155,7 @@ public class DataUtilities {
* @param c1 * @param c1
* @param c2 * @param c2
* @param width Search window size * @param width Search window size
* @return Overall similarity score (higher is beter) * @return Overall similarity score (higher is better)
*/ */
public static double rankMatch(String c1, String c2, int width) { public static double rankMatch(String c1, String c2, int width) {
double score = 0; double score = 0;

View File

@ -28,7 +28,6 @@ import org.badvision.outlaweditor.data.DataUtilities;
*/ */
public class SpellChecker { public class SpellChecker {
private static HashMap<Character, Set<String>> dictionary; private static HashMap<Character, Set<String>> dictionary;
private final double SIMILARITY_THRESHOLD = 0.5;
public SpellChecker() { public SpellChecker() {
loadDictionary(); loadDictionary();
@ -41,15 +40,10 @@ public class SpellChecker {
for (String word : words) { for (String word : words) {
Set<Suggestion> suggestions = getSuggestions(word); Set<Suggestion> suggestions = getSuggestions(word);
if (suggestions != null && !suggestions.isEmpty()) { if (suggestions != null && !suggestions.isEmpty()) {
Suggestion first = suggestions.stream().findFirst().get(); SpellResponse.Source source = new SpellResponse.Source();
if (first.similarity == 1.0) { source.start = pos;
continue; source.word = word;
} else { response.corrections.put(source, suggestions);
SpellResponse.Source source = new SpellResponse.Source();
source.start = pos;
source.word = word;
response.corrections.put(source, suggestions);
}
} }
pos += word.length() + 1; pos += word.length() + 1;
@ -86,16 +80,18 @@ public class SpellChecker {
String lower = word.toLowerCase(); String lower = word.toLowerCase();
Character first = lower.charAt(0); Character first = lower.charAt(0);
Set<String> words = dictionary.get(first); Set<String> words = dictionary.get(first);
int length = lower.length();
double threshold = length <= 2 ? 0 : Math.log(length-1) * 1.75;
if (words != null) { if (words != null) {
if (words.contains(lower)) { if (lower.length() <= 2 || words.contains(lower)) {
return null; return null;
} }
words.parallelStream().forEach((String dictWord) -> { words.parallelStream().forEach((String dictWord) -> {
int distance = DataUtilities.levenshteinDistance(lower, dictWord); int distance = DataUtilities.levenshteinDistance(lower, dictWord);
double similarity = distance / ((double) Math.max(lower.length(), dictWord.length())); if (distance <= threshold) {
if (similarity >= SIMILARITY_THRESHOLD) {
Suggestion suggestion = new Suggestion(); Suggestion suggestion = new Suggestion();
suggestion.similarity = similarity; suggestion.original = lower;
suggestion.similarity = distance;
suggestion.word = dictWord; suggestion.word = dictWord;
suggestions.add(suggestion); suggestions.add(suggestion);
} }

View File

@ -9,18 +9,43 @@
*/ */
package org.badvision.outlaweditor.spelling; package org.badvision.outlaweditor.spelling;
import static org.badvision.outlaweditor.data.DataUtilities.rankMatch;
public class Suggestion implements Comparable<Suggestion> { public class Suggestion implements Comparable<Suggestion> {
public String original;
public String word; public String word;
public double similarity; public double similarity;
private double similarityRank = -1;
public String getWord() { public String getWord() {
return word; return word;
} }
public double getSimilarity() { public double getSimilarity() {
return similarity; return similarity;
} }
@Override @Override
public int compareTo(Suggestion o) { public int compareTo(Suggestion o) {
return (int) Math.signum(o.similarity - similarity); if (similarity == o.similarity) {
double rank1 = getSimilarityRank();
double rank2 = o.getSimilarityRank();
if (rank1 == rank2) {
return (word.compareTo(o.word));
} else {
// Normalize result to -1, 0 or 1 so there are no rounding issues!
return (int) Math.signum(rank2 - rank1);
}
}
return (int) Math.signum(similarity - o.similarity);
}
private double getSimilarityRank() {
if (similarityRank < 0) {
similarityRank = rankMatch(word, original, 3) + rankMatch(word, original, 2);
}
return similarityRank;
} }
} }