Ejemplo n.º 1
0
        /// <summary>
        /// Scores how similar two company names are, considering both the full names
        /// and the initials built from their words (e.g. "International Business Machines" -> "IBM").
        /// </summary>
        /// <param name="str1">First company name.</param>
        /// <param name="str2">Second company name.</param>
        /// <returns>
        /// 0.9 when the initials of both normalized names are equal; otherwise the larger of the
        /// DamerauLevenshteinDistance comparer's results for the initials and for the normalized names.
        /// </returns>
        public override float Compare(string str1, string str2)
        {
            string normalizedLeft  = this.Normalize(str1);
            string normalizedRight = this.Normalize(str2);

            // Concatenate whatever GetFirstCharsFromWords yields for each name into one string of initials.
            string initialsLeft  = this.GetFirstCharsFromWords(normalizedLeft).Join("");
            string initialsRight = this.GetFirstCharsFromWords(normalizedRight).Join("");

            if (initialsLeft != initialsRight)
            {
                StringFuzzyComparer distance = new DamerauLevenshteinDistance();

                // Use whichever view (initials or full names) gives the better match.
                return Math.Max(
                    distance.Compare(initialsLeft, initialsRight),
                    distance.Compare(normalizedLeft, normalizedRight));
            }

            // Initials are equal (e.g. "IBM" == "IBM"): fixed score, the full names are not compared.
            // NOTE(review): this also applies when both full names are identical or when the
            // initials are equal by coincidence - confirm that a flat 0.9 is intended there.
            return 0.9f;
        }
Ejemplo n.º 2
0
        //private StringBuilder msg = new StringBuilder();

        /// <summary>
        /// Scores how similar two city names are. Names containing "/" or "-"
        /// (e.g. "Mainz-Bingen/Bingen", "Wiesbaden-Dotzheim") are first reduced to the part
        /// returned by GetLeftPart, and the final score is then scaled down by 0.9.
        /// </summary>
        /// <param name="str1">First city name.</param>
        /// <param name="str2">Second city name.</param>
        /// <returns>
        /// The DamerauLevenshteinDistance comparer's result for the (possibly reduced) names,
        /// multiplied by 0.9 if either name was reduced and by 1 otherwise.
        /// </returns>
        public override float Compare(string str1, string str2)
        {
            // "/" is checked before "-"; only the first separator found in a name is applied.
            string[] separators = { "/", "-" };
            bool reducedToPart  = false;

            // normalize, e.g. "Wiesbaden-Dotzheim" -> "wiesbaden-dotzheim"
            string left  = this.Normalize(str1);
            string right = this.Normalize(str2);

            foreach (string separator in separators)
            {
                if (left.Contains(separator))
                {
                    // presumably keeps the text in front of the separator - verify against GetLeftPart
                    left          = this.GetLeftPart(left, separator);
                    reducedToPart = true;
                    break;
                }
            }

            foreach (string separator in separators)
            {
                if (right.Contains(separator))
                {
                    right         = this.GetLeftPart(right, separator);
                    reducedToPart = true;
                    break;
                }
            }

            StringFuzzyComparer distance = new DamerauLevenshteinDistance();
            float score = distance.Compare(left, right);

            // 100% must not be reachable when only a part of a city name was compared
            float partFactor = reducedToPart ? 0.9f : 1f;

            return score * partFactor;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Scores how similar two names are, with special handling for abbreviated names
        /// such as "Müller" -> "M.".
        /// </summary>
        /// <param name="str1">First name.</param>
        /// <param name="str2">Second name.</param>
        /// <returns>
        /// If either input ends with ".", the DamerauLevenshteinDistance comparer's result for the
        /// normalized names cut via TrySubstring to the shorter length, multiplied by 0.8;
        /// otherwise the comparer's result for the full normalized names.
        /// </returns>
        public override float Compare(string str1, string str2)
        {
            // The abbreviation check looks at the raw inputs, before normalization.
            bool isAbbreviated = str1.EndsWith(".") || str2.EndsWith(".");

            // normalize, e.g. "M.-Thurgau" -> "m thurgau"
            string left  = this.Normalize(str1);
            string right = this.Normalize(str2);

            if (isAbbreviated)
            {
                // Limit both names to the length of the shorter (abbreviated) one, e.g. "M".
                // presumably TrySubstring keeps the leading characters - verify against its definition
                int commonLength = Math.Min(left.Length, right.Length);
                left  = left.TrySubstring(commonLength);
                right = right.TrySubstring(commonLength);
            }

            StringFuzzyComparer distance = new DamerauLevenshteinDistance();
            float score = distance.Compare(left, right);

            // 100% must not be reachable when one of the names is abbreviated.
            // (A cap for names whose first letters differ existed here only as disabled code.)
            return isAbbreviated ? score * 0.8f : score;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Writes to the debug output, one section per comparer, the result of comparing
        /// <paramref name="input"/> with every entry of <paramref name="testCases"/>.
        /// </summary>
        /// <param name="input">The string that each test case is compared against.</param>
        /// <param name="testCases">The candidate strings to compare with <paramref name="input"/>.</param>
        /// <remarks>
        /// Section headers are formatted with <c>string.Format</c> before being written:
        /// calling <c>Debug.WriteLine(format, input)</c> with a single string argument binds to the
        /// <c>WriteLine(string message, string category)</c> overload, which would print the input
        /// as a category and leave the "{0}" placeholder unreplaced.
        /// </remarks>
        private void StringCompareTest(string input, string[] testCases)
        {
            Debug.WriteLine(string.Format("Dice Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer dice = new DiceCoefficent();
                double diceValue         = dice.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", diceValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Jaccard Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer jaccard = new Jaccard();
                double jaccardValue         = jaccard.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", jaccardValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("ExtendedJaccard Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer exjaccard = new ExtendedJaccard();
                double exjaccardValue         = exjaccard.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", exjaccardValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("DamerauLevenshteinDistance for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer lev = new DamerauLevenshteinDistance();
                var levenStein          = lev.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", levenStein, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("JaroWinkler for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer jw = new JaroWinkler();
                var jwValue            = jw.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", jwValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Monge-Elkan for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer me = new MongeElkan();
                var meValue            = me.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", meValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("NGramDistance(2) for {0}:", input));
            foreach (var name in testCases)
            {
                // Configure the n-gram length at construction instead of via an unchecked "as" cast.
                StringFuzzyComparer ngram2 = new NGramDistance { NGramLength = 2 };
                var ngramValue2            = ngram2.Compare(input, name);

                Debug.WriteLine("\t{0}, against {1}", ngramValue2, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("SmithWaterman for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer sw = new SmithWaterman();
                var swValue            = sw.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", swValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Extended Editex for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer edx = new ExtendedEditex();
                var edxValue            = edx.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", edxValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Longest Common Subsequence for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer lcs = new LongestCommonSubsequence();
                var lcsValue            = lcs.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", lcsValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
        }