/// <summary>
/// Runs a default-constructed <see cref="NGramDistance"/> over two word pairs
/// that share the base word "acht".
/// </summary>
/// <remarks>
/// Nothing is asserted and nothing is returned; the two results are only stored
/// in locals.
/// </remarks>
public static void TestQGram()
{
    string baseWord = "acht";
    string prefixedWord = "yacht";
    string variantWord = "acha";

    NGramDistance distance = new NGramDistance();

    // Same base word against a one-letter-longer word and a one-letter-different word.
    double prefixedScore = distance.GetDistance(baseWord, prefixedWord);
    double variantScore = distance.GetDistance(baseWord, variantWord);
}
/// <summary>
/// Checks that an <see cref="NGramDistance"/> constructed with size 1 scores an
/// empty string against "al" as 0 (within a tolerance of 0.001).
/// </summary>
public void TestEmpty()
{
    StringDistance distance = new NGramDistance(1);

    float actual = distance.GetDistance("", "al");

    // Expected value goes first so that a failure message labels the two values correctly.
    Assert.AreEqual(0.0f, actual, 0.001);
}
/// <summary>
/// Picks the single best replacement for a word.
/// </summary>
/// <param name="OriginalWord">The word to look up.</param>
/// <returns>
/// <paramref name="OriginalWord"/> itself when the index reports a non-zero document
/// frequency for it in the "word" field; otherwise the highest-scoring suggestion,
/// or <c>null</c> when the checker returns no suggestions.
/// </returns>
public string GetBestMatchWord(string OriginalWord)
{
    EnsureIndexed();

    // A word that is already present in the index is returned unchanged.
    if (indexReader.DocFreq(new Term("word", OriginalWord)) > 0)
    {
        return OriginalWord;
    }

    var candidates = _luceneChecker.SuggestSimilar(OriginalWord, 10, null, "word", true);

    var jaroWinkler = new JaroWinklerDistance();
    var levenshtein = new LevenshteinDistance();
    var nGram = new NGramDistance();

    // Score = (frequency / 100 + the three similarity measures) / 4; highest score first.
    var ranked =
        (from candidate in candidates
         let frequency = indexReader.DocFreq(new Term("word", candidate))
         let jaroScore = jaroWinkler.GetDistance(OriginalWord, candidate)
         let levenshteinScore = levenshtein.GetDistance(OriginalWord, candidate)
         let nGramScore = nGram.GetDistance(OriginalWord, candidate)
         orderby ((frequency / 100f) + jaroScore + levenshteinScore + nGramScore) / 4f descending
         select candidate)
        .ToList();

    return ranked.FirstOrDefault();
}
/// <summary>
/// Builds a ranked list of alternative spellings for a word, including the
/// individual metrics behind each ranking.
/// </summary>
/// <param name="OriginalWord">The word to find alternatives for.</param>
/// <param name="NumberToReturn">Number of suggestions requested from the checker.</param>
/// <returns>
/// An <see cref="AlternateWordList"/> holding the original word, its document frequency
/// in the "word" field, and the suggestions ordered by descending combined score.
/// Each suggestion carries a 1-based <c>BestMatchSortOrder</c>.
/// </returns>
public AlternateWordList GetAlternateWordList(string OriginalWord, int NumberToReturn)
{
    var result = new AlternateWordList { OriginalWord = OriginalWord };

    EnsureIndexed();
    result.OriginalWordFrequency = indexReader.DocFreq(new Term("word", OriginalWord));

    var candidates = _luceneChecker.SuggestSimilar(OriginalWord, NumberToReturn, null, "word", true);

    var jaroWinkler = new JaroWinklerDistance();
    var levenshtein = new LevenshteinDistance();
    var nGram = new NGramDistance();

    var ranked = candidates
        .Select(candidate => new
        {
            Word = candidate,
            Frequency = indexReader.DocFreq(new Term("word", candidate)),
            Jaro = jaroWinkler.GetDistance(OriginalWord, candidate),
            Levenshtein = levenshtein.GetDistance(OriginalWord, candidate),
            NGram = nGram.GetDistance(OriginalWord, candidate)
        })
        .Select(m => new
        {
            m.Word,
            m.Frequency,
            m.Jaro,
            m.Levenshtein,
            m.NGram,
            // Combined score: (frequency / 100 + the three similarity measures) / 4.
            Score = ((m.Frequency / 100f) + m.Jaro + m.Levenshtein + m.NGram) / 4f
        })
        .OrderByDescending(m => m.Score)
        .ToList();

    // The position in the ranked sequence becomes the 1-based sort order.
    result.Words = ranked
        .Select((entry, position) => new AlternateWord
        {
            Word = entry.Word,
            Frequency = entry.Frequency,
            JaroWinkler = entry.Jaro,
            Levenshtein = entry.Levenshtein,
            NGram = entry.NGram,
            BestMatchScore = entry.Score,
            BestMatchSortOrder = position + 1
        })
        .ToList();

    return result;
}
/// <summary>
/// Returns spelling suggestions for a value, best candidates first.
/// </summary>
/// <param name="value">The text to find suggestions for.</param>
/// <param name="numberOfItems">Number of suggestions requested from the checker.</param>
/// <returns>
/// A list that starts with <paramref name="value"/> itself when the index reports a
/// non-zero document frequency for it, followed by the checker's suggestions ordered by
/// Jaro-Winkler score, then n-gram score, then the average of (frequency / 100) and
/// Levenshtein score, all descending.
/// </returns>
public List<string> GetTopSuggestions(string value, int numberOfItems)
{
    EnsureIndexed();

    var results = new List<string>();

    // If the value is already in the index, it goes first in the results.
    var knownCount = _indexReader.DocFreq(new Term(SpellCheckerConstants.SpellCheckerKey, value));
    if (knownCount > 0)
    {
        results.Add(value);
    }

    var candidates = _checker.SuggestSimilar(value, numberOfItems, null, SpellCheckerConstants.SpellCheckerKey, true);

    var jaroWinkler = new JaroWinklerDistance();
    var levenshtein = new LevenshteinDistance();
    var nGram = new NGramDistance();

    var ranked = candidates
        .Select(candidate => new
        {
            Word = candidate,
            Frequency = _indexReader.DocFreq(new Term(SpellCheckerConstants.SpellCheckerKey, candidate)),
            Jaro = jaroWinkler.GetDistance(value, candidate),
            Levenshtein = levenshtein.GetDistance(value, candidate),
            NGram = nGram.GetDistance(value, candidate)
        })
        .OrderByDescending(m => m.Jaro)
        .ThenByDescending(m => m.NGram)
        .ThenByDescending(m => (m.Frequency / 100f + m.Levenshtein) / 2f)
        .ToList();

    results.AddRange(ranked.Select(m => m.Word));
    return results;
}
/// <summary>
/// Exercises an <see cref="NGramDistance"/> constructed with size 2 against fixed
/// word pairs with known scores, plus a few relative comparisons between pairs.
/// </summary>
public void TestGetDistance2()
{
    StringDistance sd = new NGramDistance(2);

    // Absolute scores. The expected value is passed first so that a failing
    // assertion reports "expected" and "actual" the right way round.
    float actual = sd.GetDistance("al", "al");
    Assert.AreEqual(1.0f, actual, 0.001);

    actual = sd.GetDistance("a", "a");
    Assert.AreEqual(1.0f, actual, 0.001);

    actual = sd.GetDistance("b", "a");
    Assert.AreEqual(0.0f, actual, 0.001);

    actual = sd.GetDistance("a", "aa");
    Assert.AreEqual(0.5f, actual, 0.001);

    actual = sd.GetDistance("martha", "marhta");
    Assert.AreEqual(0.6666, actual, 0.001);

    actual = sd.GetDistance("jones", "johnson");
    Assert.AreEqual(0.4285, actual, 0.001);

    actual = sd.GetDistance("natural", "contrary");
    Assert.AreEqual(0.25, actual, 0.001);

    actual = sd.GetDistance("abcvwxyz", "cabvwxyz");
    Assert.AreEqual(0.625, actual, 0.001);

    actual = sd.GetDistance("dwayne", "duane");
    Assert.AreEqual(0.5833, actual, 0.001);

    actual = sd.GetDistance("dixon", "dicksonx");
    Assert.AreEqual(0.5, actual, 0.001);

    actual = sd.GetDistance("six", "ten");
    Assert.AreEqual(0, actual, 0.001);

    // Relative scores: the first pair must score strictly higher than the second.
    float first = sd.GetDistance("zac ephron", "zac efron");
    float second = sd.GetDistance("zac ephron", "kai ephron");
    Assert.IsTrue(first > second);

    first = sd.GetDistance("brittney spears", "britney spears");
    second = sd.GetDistance("brittney spears", "brittney startzman");
    Assert.IsTrue(first > second);

    // These two pairs must score the same; both sides are computed values,
    // so the argument order is kept as it was.
    first = sd.GetDistance("0012345678", "0012890678");
    second = sd.GetDistance("0012345678", "0072385698");
    Assert.AreEqual(first, second, 0.001);
}