Esempio n. 1
0
        /// <summary>
        /// Compares every unordered pair of keys in <paramref name="wordList"/> and keeps
        /// the pairs whose <c>Jaccard.Calc</c> value is at least <paramref name="threshold"/>.
        /// </summary>
        /// <param name="wordList">Maps each document key to its list of integer items.</param>
        /// <param name="threshold">Minimum similarity (inclusive) for a pair to be kept.</param>
        /// <returns>
        /// A dictionary keyed by <c>getKeyFromPair(first, second)</c>; each value holds the two
        /// document keys and their computed similarity.
        /// </returns>
        public static Dictionary <string, Tuple <T1, T1, double> > getActualPairsDictionary <T1>(Dictionary <T1, List <int> > wordList, double threshold)
        {
            var result = new Dictionary <string, Tuple <T1, T1, double> >();
            List <T1> documents = wordList.Keys.ToList();

            for (int first = 0; first < documents.Count; first++)
            {
                T1 left = documents[first];

                // Only look at documents after 'first' so each unordered pair is visited once.
                for (int second = first + 1; second < documents.Count; second++)
                {
                    T1     right   = documents[second];
                    string pairKey = getKeyFromPair(left, right);

                    // Skip pairs whose key has already been recorded.
                    if (result.ContainsKey(pairKey))
                    {
                        continue;
                    }

                    double similarity = Jaccard.Calc(wordList[left], wordList[right]);
                    if (similarity >= threshold)
                    {
                        result.Add(pairKey, new Tuple <T1, T1, double>(left, right, similarity));
                    }
                }
            }

            return result;
        }
Esempio n. 2
0
        /// <summary>
        /// Measures how far the similarity computed on MinHash signatures deviates from the
        /// similarity computed on the actual item lists, averaged over all document pairs
        /// whose actual similarity is greater than zero. The result is also written to the console.
        /// </summary>
        /// <param name="wordListActual">Maps each document key to its actual list of items.</param>
        /// <param name="wordListMinHash">
        /// Maps each document key to its MinHash signature. It is indexed with the keys of
        /// <paramref name="wordListActual"/>, so it must contain every key that takes part in a
        /// pair with non-zero actual similarity.
        /// </param>
        /// <returns>
        /// The mean relative difference in percent, or 0 when no pair had a positive actual
        /// similarity (previously this case produced NaN from a 0/0 division).
        /// </returns>
        public static double calculateMinHashFunctionsAccuracy <T1>(Dictionary <T1, List <int> > wordListActual, Dictionary <T1, List <uint> > wordListMinHash)
        {
            List <T1> docList         = wordListActual.Keys.ToList();
            double    total_diff_perc = 0;
            int       pair_count      = 0;

            for (int i = 0; i < docList.Count; i++)
            {
                for (int j = i + 1; j < docList.Count; j++)
                {
                    double jaccard_actual = Jaccard.Calc(wordListActual[docList[i]], wordListActual[docList[j]]);

                    // Pairs with zero actual similarity are skipped: the relative difference
                    // below divides by jaccard_actual.
                    if (jaccard_actual > 0)
                    {
                        double jaccard_minhash = Jaccard.Calc(wordListMinHash[docList[i]],
                                                              wordListMinHash[docList[j]]);
                        double diff_perc = (Math.Abs(jaccard_minhash - jaccard_actual) / jaccard_actual) * 100;
                        total_diff_perc += diff_perc;
                        pair_count++;
                    }
                }
            }

            // With no comparable pairs the average is undefined; report 0 instead of NaN.
            double avg_diff_perc = pair_count > 0 ? total_diff_perc / pair_count : 0;

            Console.WriteLine("Average diff from Actual and MinHash Jaccard is: " + avg_diff_perc + " %");
            return avg_diff_perc;
        }
Esempio n. 3
0
        /// <summary>
        /// Manual smoke test: builds three small integer lists and prints the result of
        /// <c>Jaccard.Calc</c> for doc1 against doc2, against itself, and against doc3.
        /// </summary>
        public static void JaccardTest1()
        {
            // Id-to-word lookup table; it is populated but not consumed by the calls below.
            var wordDict = new Dictionary <int, string>
            {
                { 1, "Word1" },
                { 2, "Word2" },
                { 3, "Word3" },
                { 4, "Word4" }
            };

            // doc1 deliberately contains the value 2 twice.
            var doc1 = new List <int> { 2, 3, 4, 2 };
            var doc2 = new List <int> { 1, 5, 4, 2 };
            var doc3 = new List <int> { 1 };

            // Compare doc1 with a partially overlapping list, with itself, and with a disjoint one.
            foreach (List <int> other in new[] { doc2, doc1, doc3 })
            {
                Console.WriteLine("Jaccard: " + Jaccard.Calc(doc1, other));
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Manual experiment: creates two number documents, then prints the similarity computed
        /// on the raw documents and the similarity computed on their <c>MinHasher2</c> signatures.
        /// </summary>
        /// <param name="sample">Not used by this method; kept so existing callers still compile.</param>
        /// <param name="sample_size">Not used by this method; kept so existing callers still compile.</param>
        /// <remarks>
        /// The original body continued after an unconditional <c>return</c> with further
        /// MinHasher3 / bucket experiments that could never execute; that unreachable code
        /// (and the local used only by it) has been removed.
        /// </remarks>
        static void processNumbersTest3(bool sample = false, int sample_size = 1000)
        {
            int    numHashFunctions = 128;
            int    universeSize     = 1000;
            double simThreshold     = 0.65;

            MinHasher2 mh2 = new MinHasher2(numHashFunctions, simThreshold);

            // NOTE(review): the meaning of the first constructor argument (10) is not visible here - confirm.
            NumberDocumentCreator numDocCreator2 = new NumberDocumentCreator(10, universeSize);

            int[] a1 = numDocCreator2.createDocument(universeSize);
            int[] a2 = numDocCreator2.createDocument(universeSize);

            Console.WriteLine("Actual jaccaard: " + MinHasher2.calculateJaccard(a1, a2));
            Console.WriteLine("MinHash jaccaard: " + MinHasher2.calculateJaccard(mh2.getMinHashSignature(a1), mh2.getMinHashSignature(a2)));
        }
        /// <summary>
        /// Generates candidate pairs from LSH buckets: every two members of the same bucket form
        /// a pair, and the similarity of their word lists is computed with <c>Jaccard.Calc</c>.
        /// Restricting comparisons to bucket members avoids comparing all documents with each other.
        /// </summary>
        /// <typeparam name="T1">Type of the document / vertex identifier.</typeparam>
        /// <typeparam name="T">Not used by this method; kept for interface compatibility.</typeparam>
        /// <param name="m_lshBuckets">Buckets mapping a bucket key to the set of ids it contains.</param>
        /// <param name="docMinhashes">Not used by this method (only referenced by commented-out code in the original).</param>
        /// <param name="wordList">Maps each id to its list of items; used for the similarity computation.</param>
        /// <param name="exclude_sim_under_threshold">
        /// When true, only pairs whose similarity is at least the instance's <c>sim_threshold</c> are kept.
        /// </param>
        /// <param name="output_file_name">
        /// Optional output path. When not null, the pairs are written to this file, preceded by a
        /// line with the number of pairs; a temporary file named <c>output_file_name + ".temp"</c>
        /// is used while generating and deleted afterwards.
        /// </param>
        /// <returns>Dictionary keyed by <c>Util.getKeyFromPair</c> holding both ids and their similarity.</returns>
        public Dictionary <string, Tuple <T1, T1, double> > generateVertexPairs <T1, T>(Dictionary <string, HashSet <T1> > m_lshBuckets, Dictionary <T1, List <uint> > docMinhashes, Dictionary <T1, List <int> > wordList, bool exclude_sim_under_threshold, string output_file_name)
        {
            var    pairsDictionary = new Dictionary <string, Tuple <T1, T1, double> >();
            int    loopCount       = 0;
            string sep             = " #-# ";
            bool   writeToFile     = output_file_name != null;
            string temp_file_name  = writeToFile ? output_file_name + ".temp" : null;

            StreamWriter wr = null;
            try
            {
                if (writeToFile)
                {
                    wr = new StreamWriter(temp_file_name); // pairs go to a temp file first
                    wr.WriteLine("-common_pairs-");
                    wr.WriteLine(string.Format("vid1{0}vid2{0}minhash_sim", sep));
                }

                foreach (var bucket in m_lshBuckets)
                {
                    // A bucket with fewer than two members cannot produce a pair.
                    if (bucket.Value.Count <= 1)
                    {
                        continue;
                    }

                    List <T1> docList = bucket.Value.ToList();
                    for (int i = 0; i < docList.Count; i++)
                    {
                        for (int j = i + 1; j < docList.Count; j++)
                        {
                            string sum = Util.getKeyFromPair(docList[i], docList[j]);

                            // The same pair can share several buckets; compute it only once.
                            if (!pairsDictionary.ContainsKey(sum))
                            {
                                double jaccard = Jaccard.Calc(wordList[docList[i]], wordList[docList[j]]);
                                if (!exclude_sim_under_threshold || jaccard >= sim_threshold)
                                {
                                    pairsDictionary.Add(sum, new Tuple <T1, T1, double>(docList[i], docList[j], jaccard));
                                    if (wr != null)
                                    {
                                        wr.WriteLine(docList[i] + sep + docList[j] + sep + jaccard);
                                    }
                                }
                            }
                            loopCount++;
                        }
                    }
                }
            }
            finally
            {
                // Release the temp-file handle even if a lookup or the similarity call throws.
                if (wr != null)
                {
                    wr.Dispose();
                }
            }

            Console.WriteLine("\r\nBucket generating candidate pairs complexity: " + loopCount);

            if (writeToFile)
            {
                using (StreamWriter output = new StreamWriter(output_file_name))
                {
                    // Prepend the number of pairs; the consumer needs it up front
                    // (original note: required for C++ vector space allocation).
                    output.WriteLine(pairsDictionary.Count);

                    using (StreamReader rd = new StreamReader(temp_file_name))
                    {
                        string buf;
                        while ((buf = rd.ReadLine()) != null)
                        {
                            output.WriteLine(buf);
                        }
                    }
                }
                File.Delete(temp_file_name);
            }

            return pairsDictionary;
        }
        /// <summary>
        /// Calculates the similarity of two lists of min hash values by delegating to the
        /// static <c>Jaccard.Calc</c>. Per the original note, the result is approximately
        /// numerically equivalent to the Jaccard similarity of the underlying sets.
        /// </summary>
        /// <param name="l1">First list of min hash values.</param>
        /// <param name="l2">Second list of min hash values.</param>
        /// <returns>The value returned by <c>Jaccard.Calc(l1, l2)</c>.</returns>
        public double Similarity(List <uint> l1, List <uint> l2)
        {
            // The original allocated an unused Jaccard instance here; Calc is called statically,
            // so no instance is needed.
            return Jaccard.Calc(l1, l2);
        }
Esempio n. 7
0
        /// <summary>
        /// Manual smoke test for <c>MinHasher3</c>: builds two fixed lists of 60 integers each,
        /// computes their MinHash values with 135 hash functions, and prints both the estimated
        /// similarity and the similarity computed directly with <c>Jaccard.Calc</c>.
        /// </summary>
        public static void MinHasher3TestFunc1()
        {
            // First sample list (contains repeated values), ten values per row.
            var inums1 = new List <int>
            {
                10, 8, 11, 13, 2, 17, 3, 1, 19, 11,
                100, 82, 115, 13, 2, 107, 3, 1, 19, 110,
                10, 8, 110, 131, 2, 173, 3, 1, 19, 114,
                10, 8, 11, 13, 2, 17, 3, 1, 19, 115,
                10, 8, 11, 133, 2, 17, 3, 1, 19, 11,
                10, 8, 11, 13, 2, 17, 3, 1, 19, 171
            };

            // Second sample list (contains repeated values), ten values per row.
            var inums2 = new List <int>
            {
                1, 2, 5, 9, 12, 17, 13, 11, 9, 10,
                1, 2, 5, 9, 12, 17, 13, 11, 9, 10,
                1, 2, 5, 9, 12, 17, 13, 151, 9, 510,
                1, 2, 5, 9, 12, 17, 13, 11, 95, 10,
                1, 23, 5, 9, 162, 17, 13, 11, 93, 10,
                19, 23, 5, 9, 12, 17, 13, 141, 94, 10
            };

            // The hasher is sized from the union of both lists.
            int        universeSize = Jaccard.unionSize(inums1, inums2);
            MinHasher3 mh           = new MinHasher3(universeSize, 135);

            List <uint> hvs1 = mh.GetMinHash(inums1).ToList();
            List <uint> hvs2 = mh.GetMinHash(inums2).ToList();

            Console.WriteLine();
            Console.WriteLine("Estimated similarity: " + mh.Similarity(hvs1, hvs2));
            Console.WriteLine("Jaccard similarity: " + Jaccard.Calc(inums1, inums2));
            Console.WriteLine("done");
        }