// Compares two company names and returns a similarity score.
//
// Both inputs are normalized first. Besides the full names, the strings built
// from the first character of every word are compared as well, so that
// abbreviated company names (e.g. "International Business Machines" -> "IBM")
// can still be matched.
//
// str1, str2: the company names to compare.
// Returns:    the top score for identical non-empty names, 0.9 when only the
//             initials are equal, otherwise the better of the initials
//             similarity and the full-name similarity.
public override float Compare(string str1, string str2)
{
    // Score for identical names. Assumes the 0..1 similarity scale implied by
    // the 0.9 "same initials" score below - TODO confirm against the comparer.
    const float ExactMatchSimilarity = 1.0f;

    // Score used when only the initials of both names are equal.
    const float SameInitialsSimilarity = 0.9f;

    string name1 = this.Normalize(str1);
    string name2 = this.Normalize(str2);

    // Identical names always have identical initials as well. Without this
    // check they would be reported as a mere abbreviation match (0.9) and
    // could never reach the top score.
    if (!string.IsNullOrEmpty(name1) && name1 == name2)
    {
        return ExactMatchSimilarity;
    }

    // check if company is shortened like "International Business Machines" -> "IBM"
    string firstChars1 = this.GetFirstCharsFromWords(name1).Join("");
    string firstChars2 = this.GetFirstCharsFromWords(name2).Join("");

    if (firstChars1 == firstChars2)
    {
        // the initials of both names are equal (e.g. "IBM" == "IBM")
        return SameInitialsSimilarity;
    }

    StringFuzzyComparer comparer = new DamerauLevenshteinDistance();
    float similarityShortened = comparer.Compare(firstChars1, firstChars2);
    float similarityNormal = comparer.Compare(name1, name2);

    // return what is better: the shortened version vs. the normal version
    return Math.Max(similarityShortened, similarityNormal);
}
// Compares two city names and returns a similarity score.
//
// Both inputs are normalized (e.g. "Wiesbaden-Dotzheim" -> "wiesbaden-dotzheim").
// A name containing "/" or "-" is then reduced to the part returned by
// GetLeftPart before the remaining names are compared.
//
// str1, str2: the city names to compare.
// Returns:    the Damerau-Levenshtein similarity of the reduced names. It is
//             multiplied by 0.9 when at least one name had to be reduced and
//             the two normalized names were not identical to begin with.
public override float Compare(string str1, string str2)
{
    // Penalty applied when only a part of a city name takes part in the comparison.
    const float PartialCityFactor = 0.9f;

    string city1 = this.Normalize(str1);
    string city2 = this.Normalize(str2);

    // When both full names are identical, neither one is "only part of the
    // other", so the penalty below must not apply.
    bool sameFullName = city1 == city2;

    bool reduced1;
    bool reduced2;
    city1 = this.ReduceToMainCity(city1, out reduced1);
    city2 = this.ReduceToMainCity(city2, out reduced2);

    StringFuzzyComparer comparer = new DamerauLevenshteinDistance();
    float similarity = comparer.Compare(city1, city2);

    // reduce similarity, 100% cannot be reached, when one city is only part of the other
    if (!sameFullName && (reduced1 || reduced2))
    {
        similarity = similarity * PartialCityFactor;
    }

    return similarity;
}

// Reduces a compound city name (e.g. "Wiesbaden-Dotzheim", "Mainz-Bingen/Bingen")
// to the part returned by GetLeftPart. "/" takes precedence over "-", and only
// one separator is processed per name. wasReduced reports whether a separator
// was found.
// NOTE(review): an earlier comment claimed "Mainz-Bingen/Bingen" -> "Bingen",
// which does not look like a "left part" - confirm what GetLeftPart returns.
private string ReduceToMainCity(string city, out bool wasReduced)
{
    string separator = null;

    if (city.Contains("/"))
    {
        separator = "/";
    }
    else if (city.Contains("-"))
    {
        separator = "-";
    }

    wasReduced = separator != null;
    return wasReduced ? this.GetLeftPart(city, separator) : city;
}
// Compares two names and returns a similarity score.
//
// If either raw input ends with "." it is treated as an abbreviation
// (e.g. "Müller" -> "M."): both normalized names are cut via TrySubstring
// using the length of the shorter one, and the resulting similarity is
// scaled by 0.8. Otherwise the normalized names are compared in full.
//
// str1, str2: the names to compare.
// Returns:    the Damerau-Levenshtein similarity, scaled by 0.8 in the
//             abbreviation case.
public override float Compare(string str1, string str2)
{
    // The abbreviation test looks at the raw inputs, before normalization.
    bool isAbbreviated = str1.EndsWith(".") || str2.EndsWith(".");

    // normalize, e.g. "M.-Thurgau" -> "m thurgau"
    string left = this.Normalize(str1);
    string right = this.Normalize(str2);

    if (!isAbbreviated)
    {
        // Plain comparison of the full names. (A cap on the score for names
        // whose first letters differ was considered here but is not active.)
        StringFuzzyComparer fullComparer = new DamerauLevenshteinDistance();
        return fullComparer.Compare(left, right);
    }

    // Only compare as many characters as the shorter (abbreviated) name has.
    int sharedLength = Math.Min(left.Length, right.Length);
    left = left.TrySubstring(sharedLength);
    right = right.TrySubstring(sharedLength);

    StringFuzzyComparer prefixComparer = new DamerauLevenshteinDistance();
    float prefixSimilarity = prefixComparer.Compare(left, right);

    // 100% cannot be reached when one of the names is shortened.
    return prefixSimilarity * 0.8f;
}
// Writes, for each similarity algorithm, the score of `input` against every
// entry of `testCases` to the debug output, one section per algorithm.
//
// input:     the reference string every test case is compared with.
// testCases: the strings to compare against `input`.
private void StringCompareTest(string input, string[] testCases)
{
    const string FixedDecimals = "###,###.00000";
    const string FormattedLine = "\t{0} against {1}";
    const string RawLine = "\t{0}, against {1}";

    // These sections widen the score to double before formatting it.
    WriteScores("Dice Coefficient", new DiceCoefficent(), input, testCases,
        FormattedLine, score => ((double)score).ToString(FixedDecimals));
    WriteScores("Jaccard Coefficient", new Jaccard(), input, testCases,
        FormattedLine, score => ((double)score).ToString(FixedDecimals));
    WriteScores("ExtendedJaccard Coefficient", new ExtendedJaccard(), input, testCases,
        FormattedLine, score => ((double)score).ToString(FixedDecimals));

    // These sections print the score with its default formatting.
    WriteScores("DamerauLevenshteinDistance", new DamerauLevenshteinDistance(), input, testCases,
        RawLine, score => score);
    WriteScores("JaroWinkler", new JaroWinkler(), input, testCases,
        RawLine, score => score);
    WriteScores("Monge-Elkan", new MongeElkan(), input, testCases,
        RawLine, score => score);
    // Object initializer instead of an unchecked "as" cast to set the n-gram length.
    WriteScores("NGramDistance(2)", new NGramDistance { NGramLength = 2 }, input, testCases,
        RawLine, score => score);
    WriteScores("SmithWaterman", new SmithWaterman(), input, testCases,
        RawLine, score => score);
    WriteScores("Extended Editex", new ExtendedEditex(), input, testCases,
        RawLine, score => score);

    // This section formats the float score directly, without widening.
    WriteScores("Longest Common Subsequence", new LongestCommonSubsequence(), input, testCases,
        FormattedLine, score => score.ToString(FixedDecimals));
}

// Writes one section: a header line, one line per test case, and a blank line.
// `render` turns the score into the value placed into `lineFormat`.
// The same comparer instance is reused for all test cases of the section;
// this assumes Compare keeps no state between calls - TODO confirm.
private static void WriteScores(
    string title,
    StringFuzzyComparer comparer,
    string input,
    string[] testCases,
    string lineFormat,
    Func<float, object> render)
{
    // Format the header explicitly. Debug.WriteLine("... {0}:", input) with a
    // string argument binds to WriteLine(string message, string category), so
    // the placeholder would be printed literally and `input` used as category.
    Debug.WriteLine(string.Format("{0} for {1}:", title, input));

    foreach (string candidate in testCases)
    {
        float score = comparer.Compare(input, candidate);
        Debug.WriteLine(lineFormat, render(score), candidate);
    }

    Debug.WriteLine("");
}