/// <summary>
/// Checks the scores returned by <c>NGramDistance(1)</c> for a fixed set of
/// string pairs, plus a few relative comparisons between pairs.
/// </summary>
public void TestGetDistance1()
{
    StringDistance nsd = new NGramDistance(1);

    // Expected value goes first so that failure messages read correctly.
    float d = nsd.GetDistance("al", "al");
    Assert.AreEqual(1.0f, d, 0.001);
    d = nsd.GetDistance("a", "a");
    Assert.AreEqual(1.0f, d, 0.001);
    d = nsd.GetDistance("b", "a");
    Assert.AreEqual(0.0f, d, 0.001);
    d = nsd.GetDistance("martha", "marhta");
    Assert.AreEqual(0.6666, d, 0.001);
    d = nsd.GetDistance("jones", "johnson");
    Assert.AreEqual(0.4285, d, 0.001);
    d = nsd.GetDistance("natural", "contrary");
    Assert.AreEqual(0.25, d, 0.001);
    d = nsd.GetDistance("abcvwxyz", "cabvwxyz");
    Assert.AreEqual(0.75, d, 0.001);
    d = nsd.GetDistance("dwayne", "duane");
    Assert.AreEqual(0.666, d, 0.001);
    d = nsd.GetDistance("dixon", "dicksonx");
    Assert.AreEqual(0.5, d, 0.001);
    d = nsd.GetDistance("six", "ten");
    Assert.AreEqual(0, d, 0.001);

    // Both variants are expected to score the same against the original.
    float d1 = nsd.GetDistance("zac ephron", "zac efron");
    float d2 = nsd.GetDistance("zac ephron", "kai ephron");
    Assert.AreEqual(d1, d2, 0.001);

    d1 = nsd.GetDistance("brittney spears", "britney spears");
    d2 = nsd.GetDistance("brittney spears", "brittney startzman");
    Assert.IsTrue(d1 > d2);

    // The tolerance here used to be written as 001, i.e. the integer 1, which
    // accepted any two scores differing by up to 1. It now matches the 0.001
    // tolerance used by every other equality check in this test.
    d1 = nsd.GetDistance("12345678", "12890678");
    d2 = nsd.GetDistance("12345678", "72385698");
    Assert.AreEqual(d1, d2, 0.001);
}
/// <summary>
/// Checks the scores returned by <c>NGramDistance(3)</c> for a fixed set of
/// string pairs, plus a few relative comparisons between pairs.
/// </summary>
public void TestGetDistance3()
{
    StringDistance sd = new NGramDistance(3);

    // Expected value goes first so that failure messages read correctly.
    float d = sd.GetDistance("al", "al");
    Assert.AreEqual(1.0f, d, 0.001);
    d = sd.GetDistance("a", "a");
    Assert.AreEqual(1.0f, d, 0.001);
    d = sd.GetDistance("b", "a");
    Assert.AreEqual(0.0f, d, 0.001);
    d = sd.GetDistance("martha", "marhta");
    Assert.AreEqual(0.7222, d, 0.001);
    d = sd.GetDistance("jones", "johnson");
    Assert.AreEqual(0.4762, d, 0.001);
    d = sd.GetDistance("natural", "contrary");
    Assert.AreEqual(0.2083, d, 0.001);
    d = sd.GetDistance("abcvwxyz", "cabvwxyz");
    Assert.AreEqual(0.5625, d, 0.001);
    d = sd.GetDistance("dwayne", "duane");
    Assert.AreEqual(0.5277, d, 0.001);
    d = sd.GetDistance("dixon", "dicksonx");
    Assert.AreEqual(0.4583, d, 0.001);
    d = sd.GetDistance("six", "ten");
    Assert.AreEqual(0, d, 0.001);

    // Relative ordering checks: only the comparison matters, not the values.
    float d1 = sd.GetDistance("zac ephron", "zac efron");
    float d2 = sd.GetDistance("zac ephron", "kai ephron");
    Assert.IsTrue(d1 > d2);

    d1 = sd.GetDistance("brittney spears", "britney spears");
    d2 = sd.GetDistance("brittney spears", "brittney startzman");
    Assert.IsTrue(d1 > d2);

    d1 = sd.GetDistance("0012345678", "0012890678");
    d2 = sd.GetDistance("0012345678", "0072385698");
    Assert.IsTrue(d1 < d2);
}
/// <summary>
/// Checks that <c>NGramDistance(1)</c> scores an empty string against a
/// non-empty one as 0.
/// </summary>
public void TestEmpty()
{
    StringDistance nsd = new NGramDistance(1);
    float d = nsd.GetDistance("", "al");

    // Expected value goes first so that a failure message reads correctly.
    Assert.AreEqual(0.0f, d, 0.001);
}
/// <summary>
/// Returns <paramref name="OriginalWord"/> itself when the index already
/// contains it; otherwise returns the highest-scoring of up to ten
/// suggestions, or <c>null</c> when there are no suggestions.
/// </summary>
/// <param name="OriginalWord">The word to look up in the "word" field.</param>
/// <returns>The best matching word, or <c>null</c> if nothing was suggested.</returns>
public string GetBestMatchWord(string OriginalWord)
{
    EnsureIndexed();

    // A word with a non-zero document frequency is returned unchanged.
    if (indexReader.DocFreq(new Term("word", OriginalWord)) > 0)
    {
        return OriginalWord;
    }

    var candidates = _luceneChecker.SuggestSimilar(OriginalWord, 10, null, "word", true);

    var jaroWinkler = new JaroWinklerDistance();
    var levenshtein = new LevenshteinDistance();
    var nGram = new NGramDistance();

    // Score = mean of (frequency / 100) and the three distance scores.
    return candidates
        .Select(candidate =>
        {
            var frequency = indexReader.DocFreq(new Term("word", candidate));
            var jaroScore = jaroWinkler.GetDistance(OriginalWord, candidate);
            var levenScore = levenshtein.GetDistance(OriginalWord, candidate);
            var nGramScore = nGram.GetDistance(OriginalWord, candidate);

            return new
            {
                Word = candidate,
                Score = ((frequency / 100f) + jaroScore + levenScore + nGramScore) / 4f
            };
        })
        .OrderByDescending(entry => entry.Score)
        .Select(entry => entry.Word)
        .FirstOrDefault();
}
/// <summary>
/// Builds the list of alternate words for <paramref name="OriginalWord"/>,
/// ordered from best to worst combined score.
/// </summary>
/// <param name="OriginalWord">The word to find alternates for.</param>
/// <param name="NumberToReturn">Number of suggestions requested from the spell checker.</param>
/// <returns>
/// A list holding the original word, its document frequency, and one entry per
/// suggestion with its individual scores, combined score and 1-based rank.
/// </returns>
public AlternateWordList GetAlternateWordList(string OriginalWord, int NumberToReturn)
{
    var result = new AlternateWordList();
    result.OriginalWord = OriginalWord;

    EnsureIndexed();

    result.OriginalWordFrequency = indexReader.DocFreq(new Term("word", OriginalWord));

    var candidates = _luceneChecker.SuggestSimilar(OriginalWord, NumberToReturn, null, "word", true);

    var jaroWinkler = new JaroWinklerDistance();
    var levenshtein = new LevenshteinDistance();
    var nGram = new NGramDistance();

    // Each suggestion is scored once; the combined score is the mean of
    // (frequency / 100) and the three distance scores.
    var ranked = candidates
        .Select(candidate =>
        {
            var frequency = indexReader.DocFreq(new Term("word", candidate));
            var jaroScore = jaroWinkler.GetDistance(OriginalWord, candidate);
            var levenScore = levenshtein.GetDistance(OriginalWord, candidate);
            var nGramScore = nGram.GetDistance(OriginalWord, candidate);

            return new
            {
                Word = candidate,
                Frequency = frequency,
                Jaro = jaroScore,
                Leven = levenScore,
                NGram = nGramScore,
                Score = ((frequency / 100f) + jaroScore + levenScore + nGramScore) / 4f
            };
        })
        .OrderByDescending(entry => entry.Score)
        .ToList();

    var words = new List<AlternateWord>();
    for (var position = 0; position < ranked.Count; position++)
    {
        var entry = ranked[position];
        words.Add(new AlternateWord
        {
            Word = entry.Word,
            Frequency = entry.Frequency,
            JaroWinkler = entry.Jaro,
            Levenshtein = entry.Leven,
            NGram = entry.NGram,
            BestMatchScore = entry.Score,
            BestMatchSortOrder = position + 1 // rank is 1-based
        });
    }

    result.Words = words;
    return result;
}
/// <summary>
/// Scratch routine: scores "acht" against "yacht" and against "acha" with a
/// default-constructed <c>NGramDistance</c>. The results are only stored in
/// locals; nothing is asserted or returned.
/// </summary>
public static void TestQGram()
{
    const string reference = "acht";
    const string withPrefix = "yacht";
    const string withSuffixChange = "acha";

    var distance = new NGramDistance();

    double prefixScore = distance.GetDistance(reference, withPrefix);
    double suffixScore = distance.GetDistance(reference, withSuffixChange);

    // Plain copy of the first score, retained from the original routine.
    double z = prefixScore;
}
/// <summary>
/// Scans one bucket of <c>_IndexDictionary</c> for the entries most similar to
/// <paramref name="word"/> and returns the best similarity seen so far.
/// </summary>
/// <param name="word">Word to match; it is lower-cased before comparison.</param>
/// <param name="maxSimilarity">Best similarity found by earlier calls; only a strictly higher score replaces it.</param>
/// <param name="index">Offset which, after subtracting <c>_minWordLength</c> and adding the word length, selects the bucket.</param>
/// <param name="equalMinDistanceDictWordList">
/// Receives the matches: cleared and restarted when a strictly better score is
/// found, appended to when a score ties the current best.
/// </param>
/// <returns>
/// The updated best similarity, or <paramref name="maxSimilarity"/> unchanged
/// when the bucket is out of range or null.
/// </returns>
public static double findSimilarDictionaryWord(string word, double maxSimilarity, int index, List <string> equalMinDistanceDictWordList)
{
    index = index - _minWordLength;
    word = word.ToLower();

    int slot = word.Length + index;
    if (slot < 0 || slot >= _IndexDictionary.Length)
    {
        return maxSimilarity;
    }

    var candidates = _IndexDictionary[slot];
    if (candidates == null)
    {
        return maxSimilarity;
    }

    // One comparer for the whole scan. The previous code allocated three
    // distance objects per iteration and used only this one.
    // NOTE(review): assumes JaccardDistance.GetDistance keeps no state between calls - confirm.
    JaccardDistance jd = new JaccardDistance();

    for (int j = 0; j < candidates.Count; j++)
    {
        string candidate = candidates[j];
        double similarity = jd.GetDistance(word, candidate);

        if (similarity > maxSimilarity)
        {
            // Strictly better: earlier matches are discarded.
            equalMinDistanceDictWordList.Clear();
            equalMinDistanceDictWordList.Add(candidate);
            maxSimilarity = similarity;
        }
        else if (similarity == maxSimilarity)
        {
            // Exact tie with the current best: keep both.
            equalMinDistanceDictWordList.Add(candidate);
        }
    }

    return maxSimilarity;
}
/// <summary>
/// Returns spelling suggestions for <paramref name="value"/>, best first.
/// </summary>
/// <param name="value">The term to look up.</param>
/// <param name="numberOfItems">Number of suggestions requested from the spell checker.</param>
/// <returns>
/// <paramref name="value"/> itself first when the index already contains it,
/// followed by the suggestions ordered by Jaro-Winkler score, then n-gram
/// score, then the mean of (frequency / 100) and the Levenshtein score, all
/// descending.
/// </returns>
public List <string> GetTopSuggestions(string value, int numberOfItems)
{
    EnsureIndexed();

    var results = new List <string>();

    // When the term already exists in the index it is listed first.
    bool alreadyIndexed = _indexReader.DocFreq(new Term(SpellCheckerConstants.SpellCheckerKey, value)) > 0;
    if (alreadyIndexed)
    {
        results.Add(value);
    }

    var suggestions = _checker.SuggestSimilar(value, numberOfItems, null, SpellCheckerConstants.SpellCheckerKey, true);

    var jaroWinkler = new JaroWinklerDistance();
    var levenshtein = new LevenshteinDistance();
    var nGram = new NGramDistance();

    var ranked =
        from suggestion in suggestions
        let frequency = _indexReader.DocFreq(new Term(SpellCheckerConstants.SpellCheckerKey, suggestion))
        let jaroScore = jaroWinkler.GetDistance(value, suggestion)
        let levenScore = levenshtein.GetDistance(value, suggestion)
        let nGramScore = nGram.GetDistance(value, suggestion)
        orderby jaroScore descending,
                nGramScore descending,
                (frequency / 100f + levenScore) / 2f descending
        select suggestion;

    results.AddRange(ranked);
    return results;
}
/// <summary>
/// Writes to the debug output, for each fuzzy comparer in turn, the score of
/// <paramref name="input"/> against every entry of <paramref name="testCases"/>.
/// </summary>
/// <param name="input">The string every test case is compared with.</param>
/// <param name="testCases">The strings to compare against <paramref name="input"/>.</param>
private void StringCompareTest(string input, string[] testCases)
{
    // The two line layouts used by the sections differ only by a comma.
    const string plainLine = "\t{0} against {1}";
    const string commaLine = "\t{0}, against {1}";

    WriteComparisonSection("Dice Coefficient", input, testCases, () => new DiceCoefficent(), plainLine, true);
    WriteComparisonSection("Jaccard Coefficient", input, testCases, () => new Jaccard(), plainLine, true);
    WriteComparisonSection("ExtendedJaccard Coefficient", input, testCases, () => new ExtendedJaccard(), plainLine, true);
    WriteComparisonSection("DamerauLevenshteinDistance", input, testCases, () => new DamerauLevenshteinDistance(), commaLine, false);
    WriteComparisonSection("JaroWinkler", input, testCases, () => new JaroWinkler(), commaLine, false);
    WriteComparisonSection("Monge-Elkan", input, testCases, () => new MongeElkan(), commaLine, false);
    // The object initializer replaces the former unchecked `as` cast.
    WriteComparisonSection("NGramDistance(2)", input, testCases, () => new NGramDistance { NGramLength = 2 }, commaLine, false);
    WriteComparisonSection("SmithWaterman", input, testCases, () => new SmithWaterman(), commaLine, false);
    WriteComparisonSection("Extended Editex", input, testCases, () => new ExtendedEditex(), commaLine, false);
    WriteComparisonSection("Longest Common Subsequence", input, testCases, () => new LongestCommonSubsequence(), commaLine, true);
}

/// <summary>
/// Writes one section: a header line, one line per test case, then an empty line.
/// </summary>
/// <param name="title">Comparer label shown in the header.</param>
/// <param name="input">The string every test case is compared with.</param>
/// <param name="testCases">The strings to compare against <paramref name="input"/>.</param>
/// <param name="createComparer">Factory invoked once per test case, so each comparison gets a fresh comparer.</param>
/// <param name="lineFormat">Composite format for a result line; {0} is the score and {1} the test case.</param>
/// <param name="fixedPoint">When true the score is rendered with the "###,###.00000" format; otherwise it is passed through as-is.</param>
private static void WriteComparisonSection(string title, string input, string[] testCases, System.Func<StringFuzzyComparer> createComparer, string lineFormat, bool fixedPoint)
{
    // Format explicitly: Debug.WriteLine(string, string) treats its second
    // argument as a category, so passing the input directly would print the
    // placeholder literally instead of substituting it.
    Debug.WriteLine(string.Format("{0} for {1}:", title, input));

    foreach (var name in testCases)
    {
        var score = createComparer().Compare(input, name);
        if (fixedPoint)
        {
            Debug.WriteLine(lineFormat, ((double)score).ToString("###,###.00000"), name);
        }
        else
        {
            Debug.WriteLine(lineFormat, score, name);
        }
    }

    Debug.WriteLine("");
}
/// <summary>
/// Scans one bucket of <c>_IndexDictionary</c> and collects, with their scores,
/// the entries whose similarity to <paramref name="word"/> lies within a fixed
/// band (0.3) of the best similarity seen so far.
/// </summary>
/// <param name="word">Word to match.</param>
/// <param name="maxSimilarity">Best similarity found by earlier calls.</param>
/// <param name="index">Offset which, after subtracting <c>_minWordLength</c>, is combined with the word length to select the bucket.</param>
/// <param name="equalMinDistanceDictWordList">
/// Receives candidate -> score pairs. When a new best is found, entries that fall
/// more than the band below it are removed. Existing keys are never overwritten.
/// </param>
/// <returns>
/// The updated best similarity; <paramref name="maxSimilarity"/> unchanged when
/// the bucket is out of range; 0 when the bucket is null.
/// </returns>
public static double findSimilarDictionaryWord(string word, double maxSimilarity, int index, Dictionary <string, double> equalMinDistanceDictWordList)
{
    // The former try/catch only did `throw e;`, which resets the stack trace,
    // so it has been removed; exceptions now propagate untouched.
    const double distanceThreshold = 0.3;

    index = index - _minWordLength;

    // The bucket actually read is (length - 1 + index). The lower guard used to
    // test (length + index), which let slot -1 through and crashed on the
    // array access. The upper guard is kept exactly as before.
    // NOTE(review): the upper guard makes the last bucket unreachable - confirm intended.
    int slot = word.Length - 1 + index;
    if (slot < 0 || slot + 1 >= _IndexDictionary.Length)
    {
        return maxSimilarity;
    }

    var candidates = _IndexDictionary[slot];
    if (candidates == null)
    {
        // NOTE(review): sibling overloads return maxSimilarity here; 0 is kept
        // because callers may depend on it - confirm.
        return 0;
    }

    // One comparer for the whole scan. The previous code allocated three
    // distance objects per iteration and used only this one.
    // NOTE(review): assumes JaccardDistance.GetDistance keeps no state between calls - confirm.
    JaccardDistance jd = new JaccardDistance();

    for (int j = 0; j < candidates.Count; j++)
    {
        string candidate = candidates[j];

        double distance = jd.GetDistance(word, candidate);
        if (distance < -1)
        {
            // Floor carried over from the original code.
            distance = -1;
        }

        if (distance > maxSimilarity)
        {
            // New best: drop entries that are now more than the band below it.
            // ToList() snapshots the dictionary so removal during the loop is safe.
            foreach (var item in equalMinDistanceDictWordList.ToList())
            {
                if (item.Value <= distance - distanceThreshold)
                {
                    equalMinDistanceDictWordList.Remove(item.Key);
                }
            }

            // tempReplacement is a member declared outside this block.
            tempReplacement = candidate;

            if (!equalMinDistanceDictWordList.ContainsKey(candidate))
            {
                equalMinDistanceDictWordList.Add(candidate, distance);
            }

            maxSimilarity = distance;
        }
        else if (distance <= maxSimilarity + distanceThreshold && distance >= maxSimilarity - distanceThreshold && distance > 0)
        {
            // Within the band around the current best and strictly positive.
            if (!equalMinDistanceDictWordList.ContainsKey(candidate))
            {
                equalMinDistanceDictWordList.Add(candidate, distance);
            }
        }
    }

    return maxSimilarity;
}
/// <summary>
/// Scans one bucket of <c>_IndexDictionary</c> for entries that equal, contain,
/// or are similar to <paramref name="word"/>.
/// </summary>
/// <param name="word">Word to match; it is lower-cased and trimmed before comparison.</param>
/// <param name="maxSimilarity">Best similarity found by earlier calls.</param>
/// <param name="index">Offset which, after subtracting <c>_minWordLength</c> and adding the untrimmed word length, selects the bucket.</param>
/// <param name="equalMinDistanceDictWordList">Receives the matching dictionary entries.</param>
/// <param name="exact">
/// When true, only the first entry that neither equals nor contains the word is
/// scored, against the whole entry, and the method then returns.
/// </param>
/// <returns>
/// 10 when an entry equals the word and the word had no surrounding whitespace;
/// otherwise the updated similarity (set to 1 for entries that contain the
/// word), or <paramref name="maxSimilarity"/> unchanged when the bucket is out
/// of range or null.
/// </returns>
public static double findSimilarDictionaryWord(string word, double maxSimilarity, int index, List <string> equalMinDistanceDictWordList, bool exact)
{
    index = index - _minWordLength;

    // Taken before lower-casing and trimming, so the bucket is selected with
    // the length of the word as it was passed in.
    int WordLength = word.Length;

    // Number of leading characters that may be skipped and trailing characters
    // that are cut when comparing. It is 0 for a negative offset or for a word
    // whose first character is upper-case and whose second is not.
    int skipSpan = index;
    if (index < 0 || (WordLength >= 2 && char.IsUpper(word[0]) && !char.IsUpper(word[1])))
    {
        skipSpan = 0;
    }

    word = word.ToLower();
    string trimmed = word.Trim();
    bool noSpace = word.CompareTo(trimmed) == 0;
    if (!noSpace)
    {
        word = trimmed;
    }

    int slot = WordLength + index;
    if (slot < 0 || slot >= _IndexDictionary.Length)
    {
        return maxSimilarity;
    }

    var candidates = _IndexDictionary[slot];
    if (candidates == null)
    {
        return maxSimilarity;
    }

    // One comparer for the whole scan. The previous code allocated three
    // distance objects per iteration and used only this one, which it already
    // reused for several calls within an iteration.
    JaccardDistance jd = new JaccardDistance();

    for (int j = 0; j < candidates.Count; j++)
    {
        string candidate = candidates[j];

        if (noSpace && candidate.CompareTo(word) == 0)
        {
            // Identical entry: report it alone with the value 10.
            equalMinDistanceDictWordList.Clear();
            equalMinDistanceDictWordList.Add(candidate);
            return 10;
        }

        if (candidate.Contains(word))
        {
            equalMinDistanceDictWordList.Add(candidate);
            maxSimilarity = 1;
            continue;
        }

        if (index > 2)
        {
            continue;
        }

        // Loop-invariant: the entry without its last skipSpan characters. It is
        // computed before the exact-mode branch, as before, so an out-of-range
        // length still throws at the same point.
        string head = candidate.Substring(0, candidate.Length - skipSpan);

        if (exact)
        {
            // NOTE(review): this returns after scoring a single entry, so later
            // entries in the bucket are never examined - confirm intended.
            double exactSimilarity = jd.GetDistance(word, candidate);
            if (exactSimilarity == 1)
            {
                equalMinDistanceDictWordList.Clear();
                equalMinDistanceDictWordList.Add(candidate);
                maxSimilarity = exactSimilarity;
            }
            return maxSimilarity;
        }

        for (int i = 0; i <= skipSpan; i++)
        {
            // Entry with its first i characters skipped.
            string tail = candidate.Substring(i);
            double similarity = Math.Max(jd.GetDistance(word, tail), jd.GetDistance(word, head));

            if (similarity > .33)
            {
                // Note: this overwrites maxSimilarity even when it is lower
                // than the previous value (behavior kept from the original).
                equalMinDistanceDictWordList.Add(candidate);
                maxSimilarity = similarity;
                break;
            }
        }
    }

    return maxSimilarity;
}