Example #1
        //Brute-force pairwise comparison: compute the actual Jaccard for every document
        //pair and keep those at or above the threshold (used as ground truth for recall).
        public static Dictionary <string, Tuple <T1, T1, double> > getActualPairsDictionary <T1, T>(Dictionary <T1, T[]> wordList, double threshold)
        {
            Dictionary <string, Tuple <T1, T1, double> > pairsDictionary = new Dictionary <string, Tuple <T1, T1, double> >();
            List <T1> docList = wordList.Keys.ToList();
            int       i, j;
            string    sum;
            double    jaccard;

            for (i = 0; i < docList.Count; i++)
            {
                for (j = i + 1; j < docList.Count; j++)
                {
                    //sum = docList[i] + "#" + docList[j];
                    sum = getKeyFromPair(docList[i], docList[j]);
                    if (!pairsDictionary.ContainsKey(sum))
                    {
                        jaccard = MinHasher2.calculateJaccard(wordList[docList[i]], wordList[docList[j]]);
                        if (jaccard >= threshold)
                        {
                            pairsDictionary.Add(sum, new Tuple <T1, T1, double>(docList[i], docList[j], jaccard));
                        }
                    }
                }
            }
            return(pairsDictionary);
        }
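The example above leans on two helpers that are not shown anywhere in this listing. The following is a minimal sketch of what they might look like; both bodies are assumptions, not the library's actual code. The commented-out "#" concatenation hints that getKeyFromPair builds an order-independent key, and MinHasher2.calculateJaccard is presumably a standard set-based Jaccard over the two arrays.

        // Assumed sketch of getKeyFromPair: order the two parts so that (a, b)
        // and (b, a) map to the same dictionary key.
        public static string getKeyFromPair <T1>(T1 a, T1 b)
        {
            string sa = a.ToString(), sb = b.ToString();
            return(string.CompareOrdinal(sa, sb) <= 0 ? sa + "#" + sb : sb + "#" + sa);
        }

        // Assumed sketch of MinHasher2.calculateJaccard: |intersection| / |union|
        // over the distinct elements of the two arrays.
        public static double calculateJaccard <T>(T[] x, T[] y)
        {
            HashSet <T> setX = new HashSet <T>(x);
            HashSet <T> setY = new HashSet <T>(y);
            int union = setX.Count + setY.Count;
            setX.IntersectWith(setY);          // setX now holds the intersection
            int intersection = setX.Count;
            union -= intersection;             // |X| + |Y| - |X ∩ Y|
            return(union == 0 ? 0.0 : (double)intersection / union);
        }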
Example #2
        /*
         * Check how closely jaccard(minhash signatures) tracks jaccard(actual documents),
         * reported as an average percentage difference.
         */
        public static double calculateMinHashFunctionsAccuracy <T1, T>(Dictionary <T1, T[]> wordListActual, Dictionary <T1, int[]> wordListMinHash)
        {
            List <T1> docList = wordListActual.Keys.ToList();
            int       i, j;
            double    jaccard_actual, jaccard_minhash;
            double    total_diff_perc = 0;
            double    diff_perc;
            int       pair_count = 0;

            for (i = 0; i < docList.Count; i++)
            {
                for (j = i + 1; j < docList.Count; j++)
                {
                    jaccard_actual = MinHasher2.calculateJaccard(wordListActual[docList[i]], wordListActual[docList[j]]);
                    if (jaccard_actual > 0)
                    {
                        jaccard_minhash = MinHasher2.calculateJaccard(wordListMinHash[docList[i]],
                                                                      wordListMinHash[docList[j]]);
                        diff_perc        = (Math.Abs(jaccard_minhash - jaccard_actual) / jaccard_actual) * 100;
                        total_diff_perc += diff_perc;
                        pair_count++;
                    }
                }
            }
            // Guard against division by zero when no pair had a non-zero actual Jaccard.
            double avg_diff_perc = pair_count > 0 ? total_diff_perc / pair_count : 0.0;

            Console.WriteLine("Average diff from Actual and MinHash Jaccard is: " + avg_diff_perc + " %");
            return(avg_diff_perc);
        }
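This accuracy check works because, for a random hash function, the probability that two sets share the same minimum hash value equals their Jaccard similarity; averaging over numHashFunctions independent functions therefore gives an estimate whose error shrinks roughly as 1/sqrt(numHashFunctions). A hypothetical driver (reusing the number-document word list from Example #3) that sweeps the number of hash functions:

            // Hypothetical sweep: the average diff should shrink roughly like
            // 1/sqrt(k) as hash functions are added.
            foreach (int k in new int[] { 64, 128, 256, 512, 1024 })
            {
                MinHasher2 hasher = new MinHasher2(k, 0.65);
                Dictionary <int, int[]> sigs = hasher.createMinhashCollection(wordList);
                Util.calculateMinHashFunctionsAccuracy(wordList, sigs);
            }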
Example #3
        static void processNumbersTest(bool sample = false, int sample_size = 1000)
        {
            //leftover from the news-corpora test (see Example #7); file and
            //pair_output_filename are unused in this numbers-only test
            string file = @"C:\Users\maydar\Dropbox\Semantic Study\ScabilityPaper\datasets\news aggregator\newsCorpora.csv-clean.txt";
            string pair_output_filename = file + "_minhashpairs.txt";

            int        numHashFunctions            = 2000;
            double     simThreshold                = 0.65;
            bool       exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set
            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

            NumberDocumentCreator   numDocCreator = new NumberDocumentCreator(10, 100000);
            Dictionary <int, int[]> wordList      = numDocCreator.documentCollection;

            //Now minhash each of the documents created above using
            //numHashFunctions (here 2000) unique hash functions.
            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            Dictionary <int, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);
            double avg_diff_perc_from_actual_and_minhash_jaccard = Util.calculateMinHashFunctionsAccuracy(wordList, docMinhashes);

            /*StringDocumentCreator strDocCreator = new StringDocumentCreator(100, 10000);
             *
             * Dictionary<int, string[]> wordList2 = strDocCreator.documentCollection;
             *
             * //Now create a MinHasher object to minhash each of the documents created above
             * //using 300 unique hashing functions.
             * //MinHasher minHasher = new MinHasher(500, 5);
             * Console.WriteLine("\r\nGenerating MinHash signatures ... ");
             * Dictionary<int, int[]> docMinhashes2 = minHasher.createMinhashCollection(wordList2);
             * double avg_diff_perc_from_actual_and_minhash_jaccard2 = Util.calculateMinHashFunctionsAccuracy(wordList2, docMinhashes2);
             */

            Console.ReadKey();
        }
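MinHasher2.createMinhashCollection is used throughout this listing but never shown. Below is a minimal sketch of a signature builder along the same lines, assuming k universal hash functions of the form (a*x + b) mod p over non-negative integer tokens and non-empty documents; the real class almost certainly differs in its hash scheme.

        public static Dictionary <int, int[]> createMinhashCollectionSketch(Dictionary <int, int[]> wordList, int numHashFunctions)
        {
            Random rnd = new Random(1);
            int    p   = 2147483647;                  // large prime (2^31 - 1)
            int[]  a   = new int[numHashFunctions];
            int[]  b   = new int[numHashFunctions];
            for (int h = 0; h < numHashFunctions; h++)
            {
                a[h] = rnd.Next(1, p);
                b[h] = rnd.Next(0, p);
            }

            Dictionary <int, int[]> signatures = new Dictionary <int, int[]>();
            foreach (int doc in wordList.Keys)
            {
                int[] sig = new int[numHashFunctions];
                for (int h = 0; h < numHashFunctions; h++)
                {
                    // Signature position h = minimum of hash function h over all tokens.
                    long min = long.MaxValue;
                    foreach (int token in wordList[doc])
                    {
                        long v = ((long)a[h] * token + b[h]) % p;
                        if (v < min)
                        {
                            min = v;
                        }
                    }
                    sig[h] = (int)min;
                }
                signatures.Add(doc, sig);
            }
            return(signatures);
        }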
Example #4
        static void processNumbersTest3(bool sample = false, int sample_size = 1000)
        {
            int    numHashFunctions = 128;
            int    universeSize     = 1000;
            double simThreshold     = 0.65;
            double atn = 0.05;

            MinHasher2 mh2 = new MinHasher2(numHashFunctions, simThreshold);

            NumberDocumentCreator numDocCreator2 = new NumberDocumentCreator(10, universeSize);

            int[] a1 = numDocCreator2.createDocument(universeSize);
            int[] a2 = numDocCreator2.createDocument(universeSize);

            Console.WriteLine("Actual jaccaard: " + MinHasher2.calculateJaccard(a1, a2));
            Console.WriteLine("MinHash jaccaard: " + MinHasher2.calculateJaccard(mh2.getMinHashSignature(a1), mh2.getMinHashSignature(a2)));

            // Early exit: the MinHasher3 / bucket-based variants below are kept
            // for reference but are unreachable.
            return;

            MinHasher3         mh  = new MinHasher3(universeSize, numHashFunctions);
            MinHasher_Buckets3 mhb = new MinHasher_Buckets3(mh, simThreshold, atn);

            NumberDocumentCreator numDocCreator = new NumberDocumentCreator(10, universeSize);

            List <int> s1 = numDocCreator.createDocument(universeSize).ToList();
            List <int> s2 = numDocCreator.createDocument(universeSize).ToList();

            Console.WriteLine("Actual jaccaard: " + Jaccard.Calc(s1, s2));
            Console.WriteLine("MinHash jaccaard: " + Jaccard.Calc(mh.GetMinHash(s1), mh.GetMinHash(s2)));
            // Second early exit: the collection-level pipeline below is also unreachable.
            return;

            Dictionary <int, List <int> > wordList = numDocCreator.documentCollectionList;

            //Now create a MinHasher object to minhash each of the documents created above
            //using 300 unique hashing functions.
            //MinHasher minHasher = new MinHasher(500, 5);
            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            Dictionary <int, List <uint> > docMinhashes          = mhb.createMinhashCollection(wordList); //minHasher.createMinhashCollection(wordList);
            double avg_diff_perc_from_actual_and_minhash_jaccard = Util.calculateMinHashFunctionsAccuracy(wordList, docMinhashes);

            /*StringDocumentCreator strDocCreator = new StringDocumentCreator(100, 10000);
             *
             * Dictionary<int, string[]> wordList2 = strDocCreator.documentCollection;
             *
             * //Now create a MinHasher object to minhash each of the documents created above
             * //using 300 unique hashing functions.
             * //MinHasher minHasher = new MinHasher(500, 5);
             * Console.WriteLine("\r\nGenerating MinHash signatures ... ");
             * Dictionary<int, int[]> docMinhashes2 = minHasher.createMinhashCollection(wordList2);
             * double avg_diff_perc_from_actual_and_minhash_jaccard2 = Util.calculateMinHashFunctionsAccuracy(wordList2, docMinhashes2);
             */

            Console.ReadKey();
        }
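One detail worth noting here: with one minimum per hash function, the textbook estimator of Jaccard similarity is the fraction of signature positions on which the two signatures agree. If calculateJaccard instead treats the two signatures as plain sets, the result is usually close but not the same quantity. A position-wise sketch:

        // Position-wise MinHash estimate: the fraction of hash functions for
        // which the two documents produced the same minimum value.
        public static double estimateJaccardFromSignatures(int[] sig1, int[] sig2)
        {
            if (sig1.Length != sig2.Length)
            {
                throw new ArgumentException("Signatures must use the same hash functions.");
            }
            int matches = 0;
            for (int h = 0; h < sig1.Length; h++)
            {
                if (sig1[h] == sig2[h])
                {
                    matches++;
                }
            }
            return((double)matches / sig1.Length);
        }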
Example #5
        public double calculatePrecision_fromActualSimilarity(Dictionary <T1, T[]> documents, double threshold) //precision from real jaccard of the pairs
        {
            Tuple <T1, T1, double> t;
            T1  i, j;
            int correct_pairs = 0;

            foreach (string key in pairsDictionary.Keys)
            {
                t = pairsDictionary[key];
                i = t.Item1;
                j = t.Item2;
                if (MinHasher2.calculateJaccard(documents[i], documents[j]) >= threshold)
                {
                    correct_pairs++;
                }
            }
            // Guard against an empty pairsDictionary to avoid division by zero.
            this.precision_from_actualSimilarity = pairsDictionary.Count > 0
                ? (double)correct_pairs / (double)pairsDictionary.Count : 0.0;
            Console.WriteLine("Precision percentage (from actual similarity) is: " + precision_from_actualSimilarity * 100 + "%");
            return(this.precision_from_actualSimilarity);
        }
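The recall counterpart, Util.calculateRecall, is called in the later examples but not shown. A minimal sketch, assuming both dictionaries use the same order-independent pair keys as getActualPairsDictionary:

        // Hypothetical sketch of Util.calculateRecall: the fraction of actual pairs
        // (from exhaustive pairwise comparison) that the LSH pipeline also reported.
        public static double calculateRecall <T1>(Dictionary <string, Tuple <T1, T1, double> > actualPairs,
                                                  Dictionary <string, Tuple <T1, T1, double> > foundPairs)
        {
            if (actualPairs.Count == 0)
            {
                return(0.0);
            }
            int hits = 0;
            foreach (string key in actualPairs.Keys)
            {
                if (foundPairs.ContainsKey(key))
                {
                    hits++;
                }
            }
            double recall = (double)hits / actualPairs.Count;
            Console.WriteLine("Recall is: " + recall * 100 + "%");
            return(recall);
        }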
Example #6
        static void generatePairsFileForRoleSim()
        {
            //to generate pair file for role-sim jaccard
            //string rdf_flat_file = @"../../input\infobox_properties_100000_flat.txt";
            //string rdf_flat_file = @"C:\Users\maydar\Documents\Visual Studio 2013\Projects\clean-v1-opt1\data-sets\university\sparql_university_4.txt_flat.txt";
            //string rdf_flat_file = @"C:\Users\maydar\Documents\Visual Studio 2013\Projects\clean-v1-opt1\data-sets\Lubm\university_all.txt_flat.txt";

            string rdf_flat_file =
                @"C:\Users\maydar\Documents\Sony Backup\PHD\SEMANTIC STUDY\dbpedia\infobox\infobox_properties_10000000_flat.txt";
            string pair_output_filename        = rdf_flat_file + "_minhashpairs.txt";
            int    numHashFunctions            = 250;
            double simThreshold                = 0.33;
            bool   exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set
            //MinHasher minHasher = new MinHasher(numHashFunctions, simThreshold);
            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

            Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****
            Stopwatch sw = new Stopwatch();
            //Create a collection of n documents with a max length of 1000 tokens

            /*NumberDocumentCreator numDocCreator = new NumberDocumentCreator(10, 10000);
             * //Create a single test document
             * int[] testDoc = numDocCreator.createDocument(10000);*/

            //StringDocumentCreator strDocCreator = new StringDocumentCreator(100, 10000);
            //Create a single test document
            //string[] testDoc = strDocCreator.createDocument(10000);

            /*int testDocIndex = 1;
             * string[] testDoc = strDocCreator.documentCollection[testDocIndex];
             * double entireCount = testDoc.Length;*/

            FlatInputReader flatInputReader = new FlatInputReader(rdf_flat_file);

            Console.WriteLine(" ");

            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            sw.Restart();
            Dictionary <int, int[]> docMinhashes = minHasher.createMinhashCollection(flatInputReader.vertexLabelList);

            sw.Stop();
            Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

            sw.Restart();
            Console.WriteLine("\r\nCreating MinHash buckets ... ");
            Dictionary <string, HashSet <int> > m_lshBuckets = minHasher.createBandBuckets(flatInputReader.vertexLabelList, docMinhashes);

            Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(flatInputReader.vertexLabelList.Count, 3) / 5);

            sw.Restart();
            Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
            Dictionary <string, Tuple <int, int, double> > pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, flatInputReader.vertexLabelList, exclude_sim_under_threshold, pair_output_filename);

            Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            Console.WriteLine("\r\nBucket pairsDictionary size: " + pairsDictionary.Count);

            Console.ReadKey();
        }
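createBandBuckets is the LSH step that makes this pipeline sub-quadratic: the signature is split into b bands of r rows each, and two documents become candidates if they agree on all r values in at least one band. A minimal sketch follows (assuming System.Linq is available; the real method also takes the word list, and its band count and key format are unknown):

        public static Dictionary <string, HashSet <int> > createBandBucketsSketch(Dictionary <int, int[]> docMinhashes, int bands)
        {
            Dictionary <string, HashSet <int> > buckets = new Dictionary <string, HashSet <int> >();
            foreach (int doc in docMinhashes.Keys)
            {
                int[] sig  = docMinhashes[doc];
                int   rows = sig.Length / bands;
                for (int band = 0; band < bands; band++)
                {
                    // Bucket key: the band index plus that band's signature slice.
                    string key = band + ":" + string.Join("-", sig.Skip(band * rows).Take(rows));
                    if (!buckets.ContainsKey(key))
                    {
                        buckets.Add(key, new HashSet <int>());
                    }
                    buckets[key].Add(doc);
                }
            }
            return(buckets);
        }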
Example #7
        static void processNewsCorporaFiles(bool sample = false, int sample_size = 1000)
        {
            //to try minhash on news corpora file
            //string file = @"C:\Users\maydar\Dropbox\Semantic Study\ScabilityPaper\datasets\news aggregator\newsCorpora.csv-clean.txt";
            string file = dataset_main_location + @"\news aggregator\newsCorpora.csv-clean.txt";
            string pair_output_filename = file + "_minhashpairs.txt";

            int        numHashFunctions            = 130;
            double     simThreshold                = 0.65;
            bool       exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set
            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);
            Dictionary <int, string[]> wordList;

            Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****
            Stopwatch sw0 = new Stopwatch();
            Stopwatch sw  = new Stopwatch();

            int[]  index_locations = { 0, 1, 2 };
            string sep             = @"\t"; // verbatim string: the two characters '\' and 't', presumably unescaped by SepInputReader
            int    limit           = -1;

            if (sample)
            {
                limit = sample_size;
                Console.WriteLine("Sample size: " + sample_size);
            }
            SepInputReader <int, string> sepInputReader = new SepInputReader <int, string>(file, index_locations, sep, false, limit);
            Dictionary <int, string>     groundTruth    = sepInputReader.groundTruth;

            //Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(flatInputReader.vertexLabelList);
            wordList = sepInputReader.wordList;
            Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));
            long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);

            /*if (!sample)
             *  wordList = sepInputReader.wordList;
             * else
             * {
             *  wordList = Util.getSampleFromDict(sepInputReader.wordList, sample_size);
             * }*/

            //Now minhash each of the documents read above using
            //numHashFunctions (here 130) unique hash functions.
            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            sw0.Restart();
            sw.Restart();

            Dictionary <int, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);

            if (sample)
            {
                //double avg_diff_perc_from_actual_and_minhash_jaccard = Util.calculateMinHashFunctionsAccuracy(wordList, docMinhashes);
            }

            sw.Stop();
            Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

            sw.Restart();
            Console.WriteLine("\r\nCreating MinHash buckets ... ");
            Dictionary <string, HashSet <int> > m_lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);

            Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

            /*
             * sw.Restart();
             * Console.WriteLine("\r\nListing buckets sizes ... ");
             * minHasher.listBucketSizes(m_lshBuckets, pair_output_filename);
             * Console.WriteLine("Listing buckets sizes in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
             * sw.Stop();*/

            sw.Restart();
            Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
            Dictionary <string, Tuple <int, int, double> > pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList, exclude_sim_under_threshold, null);

            Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            sw0.Stop();
            Console.WriteLine("\r\nTook total time of: " + sw0.Elapsed.ToString("mm\\:ss\\.ff"));

            int    foundPairsCount = pairsDictionary.Count;
            double prunePercentage = ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0;

            Cluster <int, string> cls = new Cluster <int, string>(pairsDictionary, groundTruth);

            //cls.generateClusers1();
            //double precision_from_groundTruth = cls.calculatePrecision_fromGroundTruth();
            sw.Restart();
            double precision_from_actualSimilarity = cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);

            Console.WriteLine("Calculated precision from found pairs in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            if (sample && sample_size <= 10000)
            {
                sw.Restart();
                Console.WriteLine("Calculating recall from actual should be pairs:");
                Dictionary <string, Tuple <int, int, double> > actualPairsDictionary = Util.getActualPairsDictionary(wordList, simThreshold);
                double recall = Util.calculateRecall <int>(actualPairsDictionary, pairsDictionary);
                Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
                sw.Stop();

                /*Dictionary<string, Tuple<int, int, double>> actualMinHashPairsDictionary = Util.getActualPairsDictionary(docMinhashes, simThreshold);
                 * Console.WriteLine("Calculating recall from actual MinHash pairs:");
                 * recall = Util.calculateRecall<int>(actualMinHashPairsDictionary, pairsDictionary);*/

                int a = 0; // unused; kept as a debugger breakpoint anchor
            }

            Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
            Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
            Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

            int x = 1; // unused; kept as a debugger breakpoint anchor

            Console.ReadKey();
        }
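The possible-pair count above comes from PermutationsAndCombinations.nCr(n, 2), which for the pair case reduces to n(n-1)/2; for example, 1,000 documents give 499,500 possible pairs. A sketch of that special case (the real class is not shown):

        // C(n, 2) = n * (n - 1) / 2: the number of unordered document pairs.
        public static long countPossiblePairs(long n)
        {
            return(n * (n - 1) / 2);
        }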
Example #8
        static void processNewsCorporaFiles_InstanceMatch(bool sample = false, int sample_size = 1000)
        {
            //to try minhash on news corpora file
            string file = @"C:\Users\maydar\Dropbox\Semantic Study\ScabilityPaper\datasets\news aggregator\newsCorpora.csv-clean.txt";
            string pair_output_filename = file + "_minhashpairs.txt";

            int        numHashFunctions            = 130;
            double     simThreshold                = 0.65;
            bool       exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set
            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);
            Dictionary <int, string[]>    wordList1, wordList2;
            Dictionary <string, string[]> wordList3;

            Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****
            Stopwatch sw = new Stopwatch();

            int[]  index_locations = { 0, 1, 2 };
            string sep             = @"\t"; // verbatim string: the two characters '\' and 't', presumably unescaped by SepInputReader
            int    limit           = -1;

            if (sample)
            {
                limit = sample_size;
            }

            SepInputReader <int, string> sepInputReader1 = new SepInputReader <int, string>(file, index_locations, sep, false, limit);

            wordList1 = sepInputReader1.wordList;
            SepInputReader <int, string> sepInputReader2 = new SepInputReader <int, string>(file, index_locations, sep, false, limit);

            wordList2 = sepInputReader2.wordList;

            Console.WriteLine("\r\nMerging the two wordLists ... ");
            wordList3 = Util.mergeTwoWordLists(wordList1, wordList2);

            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            sw.Restart();
            Dictionary <string, int[]> docMinhashes = minHasher.createMinhashCollection(wordList3);

            sw.Stop();
            Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

            sw.Restart();
            Console.WriteLine("\r\nCreating MinHash buckets ... ");
            Dictionary <string, HashSet <string> > m_lshBuckets = minHasher.createBandBuckets(wordList3, docMinhashes);

            Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList3.Count, 3) / 5);

            sw.Restart();
            Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
            Dictionary <string, Tuple <string, string, double> > pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList3, exclude_sim_under_threshold, null,
                                                                                                                 true);

            Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            Cluster <string, string> cls = new Cluster <string, string>(pairsDictionary, null);

            sw.Restart();
            double precision_from_actualSimilarity = cls.calculatePrecision_fromActualSimilarity(wordList3, simThreshold);

            Console.WriteLine("Calculated precision from found pairs in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            if (sample)
            {
                sw.Restart();
                Console.WriteLine("Calculating recall from actual should be pairs:");
                Dictionary <string, Tuple <string, string, double> > actualPairsDictionary = Util.getActualPairsDictionary(wordList3, simThreshold);
                double recall = Util.calculateRecall <string>(actualPairsDictionary, pairsDictionary);
                Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
                sw.Stop();

                /*Dictionary<string, Tuple<int, int, double>> actualMinHashPairsDictionary = Util.getActualPairsDictionary(docMinhashes, simThreshold);
                 * Console.WriteLine("Calculating recall from actual MinHash pairs:");
                 * recall = Util.calculateRecall<int>(actualMinHashPairsDictionary, pairsDictionary);*/

                int a = 0; // unused; kept as a debugger breakpoint anchor
            }

            int x = 1; // unused; kept as a debugger breakpoint anchor

            Console.ReadKey();
        }
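Util.mergeTwoWordLists is not shown; it presumably rekeys both dictionaries into a single one, tagging each key with its source so that instance-match pairs can later be restricted to cross-source pairs. A hypothetical sketch using the |first|/|second| prefix convention that appears in Example #9:

        // Hypothetical sketch: merge two word lists into one dictionary, prefixing
        // keys with their source so cross-source pairs can be identified later.
        public static Dictionary <string, string[]> mergeTwoWordListsSketch(Dictionary <int, string[]> first,
                                                                            Dictionary <int, string[]> second)
        {
            Dictionary <string, string[]> merged = new Dictionary <string, string[]>();
            foreach (int key in first.Keys)
            {
                merged.Add("|first|-" + key, first[key]);
            }
            foreach (int key in second.Keys)
            {
                merged.Add("|second|-" + key, second[key]);
            }
            return(merged);
        }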
Example #9
        private static void processSpimbenchFiles_InstanceMatch(bool sample = false, int sample_size = 1000)
        {
            Console.WriteLine("Processing Spimbench_large ...");
            string file1   = dataset_main_location + @"\IM2016_Spimbench_large\Abox1.nt";
            string file2   = dataset_main_location + @"\IM2016_Spimbench_large\Abox2.nt";
            string file_gt = dataset_main_location + @"\IM2016_Spimbench_large\refalign.rdf"; //ground truth file
            int    numHashFunctions = 128;
            double simThreshold     = 0.3;

            string pair_output_filename = file1 + "_minhashpairs.txt";
            string prefix1 = "|first|", prefix2 = "|second|", sep_prefix = "-";


            bool exclude_sim_under_threshold = false;
            //vertex pairs which have estimated similarity under the threshold will be excluded if set
            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);
            Dictionary <string, string[]> wordList;

            Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****
            Stopwatch sw    = new Stopwatch();
            Stopwatch sw0   = new Stopwatch();
            int       limit = -1;

            if (sample)
            {
                limit = sample_size;
                Console.WriteLine("Sample size: " + sample_size);
            }
            UobmInputReader uobmInputReader = new UobmInputReader(file1, file2, file_gt, limit, prefix1, prefix2,
                                                                  sep_prefix);

            wordList = uobmInputReader.wordList;
            Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));

            //long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);
            long possiblePairCount = uobmInputReader.possiblePairsCount;

            sw0.Restart();
            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            sw.Restart();
            Dictionary <string, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);

            sw.Stop();
            Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

            sw.Restart();
            Console.WriteLine("\r\nCreating MinHash buckets ... ");
            Dictionary <string, HashSet <string> > m_lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);

            Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

            sw.Restart();
            Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
            Dictionary <string, Tuple <string, string, double> > pairsDictionary =
                minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList, exclude_sim_under_threshold, null,
                                              true, prefix1, prefix2, sep_prefix);

            Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " +
                              sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();
            sw0.Stop();
            Console.WriteLine("\r\nTook total time of: " + sw0.Elapsed.ToString("mm\\:ss\\.ff"));
            int    foundPairsCount = pairsDictionary.Count;
            double prunePercentage = ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0;

            Cluster <string, string> cls = new Cluster <string, string>(pairsDictionary, null);

            sw.Restart();
            double precision_from_actualSimilarity = cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);

            Console.WriteLine("Calculated precision from found pairs in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            sw.Restart();
            Console.WriteLine("Calculating recall from ground truth:");
            double recall = Util.calculateRecall <string>(uobmInputReader.gtPairsDictionary, pairsDictionary);

            Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " +
                              sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();
            // Guard against division by zero when both precision and recall are zero.
            double fmeasure = (precision_from_actualSimilarity + recall) > 0
                ? 2 * ((precision_from_actualSimilarity * recall) / (precision_from_actualSimilarity + recall)) : 0.0;

            Console.WriteLine("F-measure: " + fmeasure);

            Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
            Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
            Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));
            Console.ReadKey();
        }
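For reference, the fmeasure computed above is the balanced F1 score, F1 = 2PR / (P + R). For example, precision 0.9 and recall 0.6 give F1 = 2 * 0.54 / 1.5 = 0.72; the harmonic mean punishes an imbalance between the two far more than an arithmetic mean would.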
Example #10
        static void processAmazonJsonDumpFiles(bool sample = false, int sample_size = 1000)
        {
            Console.WriteLine("Amazon meta data will be made available (for research purposes) on request. Please contact Julian McAuley ([email protected]) to obtain a link.");
            //to try minhash on amazon json dump files
            string amz_json_file = @"C:\Users\maydar\Documents\Sony Backup\PROJECTS\amazon\review-dumps\test\meta_Office_Products.json.gz";

            string pair_output_filename = amz_json_file + "_minhashpairs.txt";


            int    numHashFunctions            = 130;
            double simThreshold                = 0.65;
            bool   exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set
            //MinHasher minHasher = new MinHasher(numHashFunctions, simThreshold);
            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);
            Dictionary <string, string[]> wordList;

            Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****
            Stopwatch sw  = new Stopwatch();
            Stopwatch sw0 = new Stopwatch();

            int limit = -1;

            if (sample)
            {
                limit = sample_size;
                Console.WriteLine("Sample size: " + sample_size);
            }
            AmazonJsonInputReader amzInputReader = new AmazonJsonInputReader(amz_json_file, false, limit);

            //Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(flatInputReader.vertexLabelList);

            /*if (!sample)
             *  wordList = amzInputReader.productWordList;
             * else
             * {
             *  wordList = Util.getSampleFromDict(amzInputReader.productWordList, sample_size);
             * }*/

            wordList = amzInputReader.productWordList;
            Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));
            long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);

            Console.WriteLine(" ");

            //Now minhash each of the product documents read above using
            //numHashFunctions (here 130) unique hash functions.
            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            sw0.Restart();
            sw.Restart();

            //Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(flatInputReader.vertexLabelList);
            Dictionary <string, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);

            sw.Stop();
            Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

            sw.Restart();
            Console.WriteLine("\r\nCreating MinHash buckets ... ");
            Dictionary <string, HashSet <string> > m_lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);

            Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

            sw.Restart();
            Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
            Dictionary <string, Tuple <string, string, double> > pairsDictionary =
                minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList, exclude_sim_under_threshold, null);

            Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
            sw.Stop();

            sw0.Stop();
            Console.WriteLine("\r\nTook total time of: " + sw0.Elapsed.ToString("mm\\:ss\\.ff"));
            int    foundPairsCount = pairsDictionary.Count;
            double prunePercentage = ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0;

            Console.WriteLine("\r\nBucket pairsDictionary size: " + pairsDictionary.Count);

            Cluster <string, string> cls = new Cluster <string, string>(pairsDictionary, null);
            //cls.generateClusers1();
            //double precision_from_groundTruth = cls.calculatePrecision_fromGroundTruth();
            double precision_from_actualSimilarity = cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);

            if (sample && limit <= 50000)
            {
                sw.Restart();
                Console.WriteLine("Calculating recall from actual should be pairs:");
                Dictionary <string, Tuple <string, string, double> > actualPairsDictionary = Util.getActualPairsDictionary(wordList, simThreshold);
                double recall = Util.calculateRecall <string>(actualPairsDictionary, pairsDictionary);
                Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
                sw.Stop();
                int a = 0; // unused; kept as a debugger breakpoint anchor
            }

            Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
            Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
            Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

            Console.ReadKey();
        }
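The prune percentage printed at the end measures how much of the quadratic pairwise work the banding avoided: with n products there are C(n, 2) = n(n-1)/2 possible pairs, and only foundPairsCount survive bucketing. For example, with n = 1,000 there are 499,500 possible pairs; if bucketing yields 5,000 candidates, the prune percentage is (499,500 - 5,000) / 499,500 ≈ 99.0%.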