Example #1
0
        /// <summary>
        /// Handles the OK button click: reads the two selected columns of
        /// <c>mainWindow.gridData</c> as numbers, computes the selected similarity
        /// measure between them and appends a result line to <c>textBlock</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Ok_Click(object sender, RoutedEventArgs e)
        {
            // Nothing can be computed until both columns and a measure are chosen;
            // dereferencing a null SelectedItem would otherwise throw.
            if (comboBoxCol1.SelectedItem == null ||
                comboBoxCol2.SelectedItem == null ||
                similarityMeasures.SelectedItem == null)
            {
                return;
            }

            var firstColumn  = comboBoxCol1.SelectedItem.ToString();
            var secondColumn = comboBoxCol2.SelectedItem.ToString();
            var measureName  = similarityMeasures.SelectedItem.ToString();

            var format = new NumberFormatInfo();
            format.NegativeSign = "-";

            // Reads one grid column as doubles. The column lookup is resolved once
            // per column instead of once per row.
            Func<string, double[]> readColumn = selectedName =>
            {
                var columnName = mainWindow.gridData.Columns[selectedName].ColumnName;

                return (from row in mainWindow.gridData.AsEnumerable()
                        select Convert.ToDouble(ConvertString(row.Field<string>(columnName)), format)).ToArray();
            };

            var valuesCol1 = readColumn(firstColumn);
            var valuesCol2 = readColumn(secondColumn);

            string similarityText;
            switch (measureName)
            {
                case "Jaccard":
                    similarityText = new Jaccard().Similarity(valuesCol1, valuesCol2).ToString();
                    break;

                case "Korelacja Pearsona":
                    similarityText = new PearsonCorrelation().Similarity(valuesCol1, valuesCol2).ToString();
                    break;

                case "Cosinus":
                    similarityText = new Cosine().Similarity(valuesCol1, valuesCol2).ToString();
                    break;

                default:
                    // Unrecognised measure: nothing is appended.
                    return;
            }

            // The measure names double as the label shown in the result line.
            textBlock.Text += "\nPodobieństwo miary " + measureName + " pomiędzy kolumnami " + firstColumn + " i " + secondColumn + " wynosi: " + similarityText;
        }
Example #2
0
        /// <summary>
        /// Builds a crawled-link record from a page's address, text contents and
        /// outgoing links. The contents are split on spaces, filtered, hashed into
        /// shingles, stemmed and counted; progress is reported on the console.
        /// </summary>
        /// <param name="address">Address of the crawled page; stored in <c>Address</c>.</param>
        /// <param name="contents">Page text; split on single spaces into tokens.</param>
        /// <param name="links">Links found on the page; stored de-duplicated in <c>Links</c>.</param>
        public CrawlerLink(string address, string contents, IEnumerable<string> links)
        {
            Console.WriteLine("Processing contents...");

            this.Address = address;

            // Store links
            this.Links = new HashSet<string>(links);

            // Tokenize
            Console.Write("Tokenizing... ");
            var tokens = new List<string>(contents.Split(' '));

            Console.WriteLine("Done!");
            Console.Write("Removing short and stop word tokens... ");
            tokens.RemoveAll(token => token.Length <= 1 || StopWords.StopWordsList.Contains(token));
            Console.WriteLine("Done!");

            // Generate shingle hashes (from the filtered, not yet stemmed, tokens)
            Console.Write("Generating shingle hashes... ");
            var jaccard = new Jaccard();

            this.ShingleHashes = new LinkedList<ulong>(jaccard.HashedShinglifyDocument(tokens.ToArray()));
            Console.WriteLine("Done!");

            // Apply stemming
            Console.Write("Stemming tokens... ");
            var stemmer       = new PorterStemmer();
            var stemmedTokens = new List<string>(tokens.Select(token => stemmer.StemWord(token)));

            this.Tokens = new HashSet<string>(stemmedTokens);
            Console.WriteLine("Done!");

            // Sort elements (kept so keywords are still inserted in sorted order)
            Console.Write("Sorting stemmed tokens... ");
            stemmedTokens.Sort();
            Console.WriteLine("Done!");

            // Get keyword count
            Console.Write("Adding stemmed tokens to dictionary... ");
            var keywords = new Dictionary<string, int>();

            foreach (var stemmedToken in stemmedTokens)
            {
                // Count by key lookup, not by comparing with the previous token:
                // the sort above is culture-sensitive while dictionary keys compare
                // ordinally, so equal keys are not guaranteed to be adjacent and an
                // adjacency-based count could be reset to 1 part-way through.
                int occurrences;
                keywords.TryGetValue(stemmedToken, out occurrences);
                keywords[stemmedToken] = occurrences + 1;
            }

            this.Keywords = keywords;
            Console.WriteLine("Done!");
        }
Example #3
0
        /// <summary>
        /// Verifies the Jaccard distance (k = 2) between "ABCDE" and "ABCDF".
        /// </summary>
        public void TestDistance()
        {
            var instance = new Jaccard(k: 2);

            var result = instance.Distance("ABCDE", "ABCDF");

            // Compare to 10 decimal places, not bit-for-bit: the value is the
            // result of floating-point arithmetic.
            Assert.Equal(expected: 0.4, actual: result, precision: 10);
        }
Example #4
0
        /// <summary>
        /// Runs LSH over the listings' description min-hashes and, for each listing,
        /// records its nearest neighbour as a likely duplicate when the ratio of the
        /// two buy-now prices lies between 0.8 and 1.2.
        /// </summary>
        /// <param name="listings">Listings to compare; matched listings get their
        /// <c>likely_duplicate_id_by_description</c> and <c>similarity_description</c> set.</param>
        /// <param name="job_id">Job whose progress is advanced once per listing.</param>
        /// <param name="duplicates">Receives both directions of every detected pair, keyed by listing id.</param>
        private static void CalculateLshForListingSet(List<Listing> listings, string job_id, Dictionary<long, long> duplicates)
        {
            // Nothing to compare; this also avoids building an LSH with zero buckets.
            if (listings.Count == 0)
            {
                return;
            }

            var numSimilarityBuckets = (int)Math.Ceiling(listings.Count / 100M);

            // First make 2 dimensional array (docs by min-hashes).
            // NOTE(review): assumes no listing carries more than minHashCount hashes - confirm.
            var matrix = new int[listings.Count, minHashCount];

            for (int listing = 0; listing < listings.Count; listing++)
            {
                var minhashes = listings[listing].minhash_description;

                for (int hash = 0; hash < minhashes.Count; hash++)
                {
                    matrix[listing, hash] = (int)minhashes[hash];
                }
            }

            // Now set LSH
            var lsh = new LSH(matrix, numSimilarityBuckets);

            lsh.Calc();

            // Set closest duplicate on each listing.
            // NOTE(review): duplicatesFound is never added to, so the ContainsKey check
            // below never fires - confirm whether `duplicates` was the intended lookup.
            var duplicatesFound    = new Dictionary<long, long>();
            var singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);

            for (int listing = 0; listing < listings.Count; listing++)
            {
                ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);

                var nearest = lsh.GetNearest(listing);
                if (!nearest.Any())
                {
                    continue;
                }

                var thisListing    = listings[listing];
                var nearestListing = listings[nearest[0]];

                // A zero price makes the ratio undefined (the division would throw),
                // so such a listing cannot be price-matched.
                if (thisListing.buy_now_price == 0)
                {
                    continue;
                }

                var priceRatio = nearestListing.buy_now_price / thisListing.buy_now_price;
                if (priceRatio < 0.8M || priceRatio > 1.2M)
                {
                    continue;
                }

                if (duplicatesFound.ContainsKey(nearestListing.id))
                {
                    continue;
                }

                thisListing.likely_duplicate_id_by_description = nearestListing.id;
                thisListing.similarity_description             = Jaccard.Calc(ArrayHelpers.GetRow<int>(matrix, listing).ToList(), nearest);

                // Record the pair in both directions.
                duplicates[nearestListing.id] = thisListing.id;
                duplicates[thisListing.id]    = nearestListing.id;
            }
        }
Example #5
0
        /// <summary>
        /// Verifies the Jaccard similarity (k = 2) between "ABCDE" and "ABCDF".
        /// </summary>
        public void TestSimilarity()
        {
            var instance = new Jaccard(k: 2);

            // AB BC CD DE DF
            // 1  1  1  1  0
            // 1  1  1  0  1
            // => 3 / 5 = 0.6
            var result = instance.Similarity("ABCDE", "ABCDF");

            // Compare to 10 decimal places, not bit-for-bit: the value is the
            // result of a floating-point division.
            Assert.Equal(expected: 0.6, actual: result, precision: 10);
        }
Example #6
0
        /// <summary>
        /// Demo entry point: scores two fixed questions with the Jaccard and Cosine
        /// similarity algorithms, prints both scores and waits for console input.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        internal static void Main(string[] args)
        {
            const string firstQuestion  = "What is your zip code?";
            const string secondQuestion = "What is your postal code?";

            // Jaccard is evaluated first, then Cosine.
            ISimilarityAlgorithm jaccard = new Jaccard();
            var jaccardScore = jaccard.Run(firstQuestion, secondQuestion);

            ISimilarityAlgorithm cosine = new Cosine();
            var cosineScore = cosine.Run(firstQuestion, secondQuestion);

            Console.WriteLine($"Jaccard similarity: {jaccardScore}");
            Console.WriteLine($"Cosine similarity: {cosineScore}");

            // Wait for input before returning.
            Console.Read();
        }
Example #7
0
        ///// <summary>
        ///// Calculates the Levenshtein distance between two strings
        ///// </summary>
        ///// Source: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C.23
        ///// Explanation: https://en.wikipedia.org/wiki/Levenshtein_distance
        //private Int32 levenshtein(String a, String b)
        //{

        //    if (string.IsNullOrEmpty(a))
        //    {
        //        if (!string.IsNullOrEmpty(b))
        //        {
        //            return b.Length;
        //        }
        //        return 0;
        //    }

        //    if (string.IsNullOrEmpty(b))
        //    {
        //        if (!string.IsNullOrEmpty(a))
        //        {
        //            return a.Length;
        //        }
        //        return 0;
        //    }

        //    Int32 cost;
        //    Int32[,] d = new int[a.Length + 1, b.Length + 1];
        //    Int32 min1;
        //    Int32 min2;
        //    Int32 min3;

        //    for (Int32 i = 0; i <= d.GetUpperBound(0); i += 1)
        //    {
        //        d[i, 0] = i;
        //    }

        //    for (Int32 i = 0; i <= d.GetUpperBound(1); i += 1)
        //    {
        //        d[0, i] = i;
        //    }

        //    for (Int32 i = 1; i <= d.GetUpperBound(0); i += 1)
        //    {
        //        for (Int32 j = 1; j <= d.GetUpperBound(1); j += 1)
        //        {
        //            cost = Convert.ToInt32(!(a[i - 1] == b[j - 1]));

        //            min1 = d[i - 1, j] + 1;
        //            min2 = d[i, j - 1] + 1;
        //            min3 = d[i - 1, j - 1] + cost;
        //            d[i, j] = Math.Min(Math.Min(min1, min2), min3);
        //        }
        //    }

        //    return d[d.GetUpperBound(0), d.GetUpperBound(1)];

        //}

        ///// <summary>
        ///// String-similarity computed with levenshtein-distance
        ///// </summary>
        //private double similarityLevenshtein(string a, string b)
        //{
        //    if (a.Equals(b))
        //    {
        //        return 1.0;
        //    }
        //    else
        //    {
        //        if (!(a.Length == 0 || b.Length == 0))
        //        {
        //            double sim = 1 - (levenshtein(a, b) / Convert.ToDouble(Math.Min(a.Length, b.Length)));
        //            return sim;
        //        }
        //        else
        //            return 0.0;
        //    }
        //}

        ///// <summary>
        ///// String-similarity computed with Dice Coefficient
        ///// </summary>
        ///// Source: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#C.23
        ///// Explanation: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
        //private double similarityDiceCoefficient(string a, string b)
        //{
        //    //Workaround for |a| == |b| == 1
        //    if (a.Length <= 1 && b.Length <= 1)
        //    {
        //        if (a.Equals(b))
        //            return 1.0;
        //        else
        //            return 0.0;
        //    }

        //    HashSet<string> setA = new HashSet<string>();
        //    HashSet<string> setB = new HashSet<string>();

        //    for (int i = 0; i < a.Length - 1; ++i)
        //        setA.Add(a.Substring(i, 2));

        //    for (int i = 0; i < b.Length - 1; ++i)
        //        setB.Add(b.Substring(i, 2));

        //    HashSet<string> intersection = new HashSet<string>(setA);
        //    intersection.IntersectWith(setB);

        //    return (2.0 * intersection.Count) / (setA.Count + setB.Count);
        //}

        /// <summary>
        /// Scores two strings as the unweighted mean of three similarity measures:
        /// normalized Levenshtein, Jaro-Winkler and Jaccard.
        /// </summary>
        /// <param name="a">First string.</param>
        /// <param name="b">Second string.</param>
        /// <returns>The arithmetic mean of the three similarity values.</returns>
        private double similarity(string a, string b)
        {
            double levenshteinScore = new NormalizedLevenshtein().Similarity(a, b);
            double jaroWinklerScore = new JaroWinkler().Similarity(a, b);
            double jaccardScore     = new Jaccard().Similarity(a, b);

            double[] scores = { levenshteinScore, jaroWinklerScore, jaccardScore };

            // Accumulate in the same order the measures were evaluated.
            double total = 0.0;
            for (int i = 0; i < scores.Length; i++)
            {
                total += scores[i];
            }

            return total / scores.Length;
        }
Example #8
0
        /// <summary>
        /// Writes to the debug output the score of each fuzzy string comparer for
        /// <paramref name="input"/> against every entry of <paramref name="testCases"/>,
        /// one section per comparer.
        /// </summary>
        /// <param name="input">The string every test case is compared with.</param>
        /// <param name="testCases">Candidate strings compared against <paramref name="input"/>.</param>
        private void StringCompareTest(string input, string[] testCases)
        {
            // Section headers are formatted with string.Format on purpose: with a single
            // string argument, Debug.WriteLine(format, input) binds to the
            // (message, category) overload and would print the "{0}" placeholder literally.
            Debug.WriteLine(string.Format("Dice Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer dice = new DiceCoefficent();
                double diceValue         = dice.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", diceValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Jaccard Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer jaccard = new Jaccard();
                double jaccardValue         = jaccard.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", jaccardValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("ExtendedJaccard Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer exjaccard = new ExtendedJaccard();
                double exjaccardValue         = exjaccard.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", exjaccardValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("DamerauLevenshteinDistance for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer lev = new DamerauLevenshteinDistance();
                var levenStein          = lev.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", levenStein, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("JaroWinkler for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer jw = new JaroWinkler();
                var jwValue            = jw.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", jwValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Monge-Elkan for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer me = new MongeElkan();
                var meValue            = me.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", meValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("NGramDistance(2) for {0}:", input));
            foreach (var name in testCases)
            {
                // Configure the n-gram length at construction; no unchecked `as` cast needed.
                StringFuzzyComparer ngram2 = new NGramDistance { NGramLength = 2 };
                var ngramValue2 = ngram2.Compare(input, name);

                Debug.WriteLine("\t{0}, against {1}", ngramValue2, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("SmithWaterman for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer sw = new SmithWaterman();
                var swValue            = sw.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", swValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Extended Editex for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer edx = new ExtendedEditex();
                var edxValue            = edx.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", edxValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Longest Common Subsequence for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer lcs = new LongestCommonSubsequence();
                var lcsValue            = lcs.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", lcsValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
        }
Example #9
0
 /// <summary>
 /// Creates the form and selects Jaccard as the duplicate-detection strategy.
 /// </summary>
 public Form1()
 {
     // Presumably the designer-generated control setup; runs before anything else.
     InitializeComponent();
     // Use a Jaccard instance for duplicate detection.
     DuplicateDetection = new Jaccard();
 }
Example #10
0
        /// <summary>
        /// Computes the Jaccard index of two value arrays as the complement of the
        /// Jaccard distance (thin wrapper over the Accord.NET Jaccard distance).
        /// </summary>
        /// <param name="truth">Ground-truth values.</param>
        /// <param name="predictions">Predicted values.</param>
        /// <returns>The Jaccard index as a <see cref="double"/>, i.e. 1 minus the Jaccard distance.</returns>
        public static double CalculateJaccardIndex(double[] truth, double[] predictions)
        {
            var distance = new Jaccard().Distance(truth, predictions);

            // Index and distance are complements of each other.
            return 1 - distance;
        }