Пример #1
0
        /// <summary>
        /// Runs the MinHash pipeline (signatures, band buckets, candidate vertex pairs) over the
        /// cleaned news-aggregator corpus file, printing per-step timings, the number of pairs
        /// found and the prune percentage, then waits for a key press.
        /// </summary>
        /// <param name="sample">
        /// When true, <paramref name="sample_size"/> is passed to the input reader as its limit
        /// argument (otherwise -1 is passed) and the sample size is printed.
        /// </param>
        /// <param name="sample_size">
        /// Limit handed to the reader when <paramref name="sample"/> is true. The pairwise recall
        /// computation is only run when sampling with a size of at most 10000.
        /// </param>
        static void processNewsCorporaFiles(bool sample = false, int sample_size = 1000)
        {
            const string elapsedFormat = "mm\\:ss\\.ff";

            string file = dataset_main_location + @"\news aggregator\newsCorpora.csv-clean.txt";

            int        numHashFunctions            = 130;
            double     simThreshold                = 0.65;
            bool       exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set
            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

            // Enlarging the console buffer is purely cosmetic, so a missing or non-resizable
            // console (redirected output, non-Windows platform) must not abort the run.
            try
            {
                Console.BufferHeight = Int16.MaxValue - 1;
            }
            catch (System.IO.IOException)
            {
                // No real console buffer to resize; continue with the default.
            }
            catch (PlatformNotSupportedException)
            {
                // Setting the buffer height is not supported on this platform; continue.
            }

            Stopwatch totalTimer = new Stopwatch();
            Stopwatch stepTimer  = new Stopwatch();

            int[]  index_locations = { 0, 1, 2 };
            // NOTE(review): verbatim literal, i.e. a backslash followed by 't', not a tab character.
            // Presumably SepInputReader interprets it as a pattern/escape -- confirm.
            string sep             = @"\t";
            int    limit           = -1;

            if (sample)
            {
                limit = sample_size;
                Console.WriteLine("Sample size: " + sample_size);
            }
            SepInputReader <int, string> sepInputReader = new SepInputReader <int, string>(file, index_locations, sep, false, limit);
            Dictionary <int, string>     groundTruth    = sepInputReader.groundTruth;
            Dictionary <int, string[]>   wordList       = sepInputReader.wordList;

            Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));
            long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);

            // Step 1: MinHash signatures, using numHashFunctions hash functions per instance.
            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            totalTimer.Restart();
            stepTimer.Restart();

            Dictionary <int, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);

            stepTimer.Stop();
            Console.WriteLine("Generated MinHash signatures in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));

            // Step 2: band buckets built from the signatures.
            stepTimer.Restart();
            Console.WriteLine("\r\nCreating MinHash buckets ... ");
            Dictionary <string, HashSet <int> > m_lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);

            stepTimer.Stop();
            Console.WriteLine("Created MinHash buckets in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));

            Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

            // Step 3: candidate vertex pairs generated from the buckets.
            stepTimer.Restart();
            Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
            Dictionary <string, Tuple <int, int, double> > pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList, exclude_sim_under_threshold, null);

            stepTimer.Stop();
            Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));

            totalTimer.Stop();
            Console.WriteLine("\r\nTook total time of: " + totalTimer.Elapsed.ToString(elapsedFormat));

            int foundPairsCount = pairsDictionary.Count;

            // With no possible pairs the ratio would be 0/0 (NaN); report 0 instead.
            double prunePercentage = possiblePairCount > 0
                ? ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0
                : 0.0;

            Cluster <int, string> cls = new Cluster <int, string>(pairsDictionary, groundTruth);

            // The returned precision is not used here.
            // NOTE(review): presumably the method reports its result itself -- confirm.
            stepTimer.Restart();
            cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);

            stepTimer.Stop();
            Console.WriteLine("Calculated precision from found pairs in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));

            // Recall needs the full pairwise comparison, so it is restricted to small samples.
            if (sample && sample_size <= 10000)
            {
                stepTimer.Restart();
                Console.WriteLine("Calculating recall from actual should be pairs:");
                Dictionary <string, Tuple <int, int, double> > actualPairsDictionary = Util.getActualPairsDictionary(wordList, simThreshold);
                Util.calculateRecall <int>(actualPairsDictionary, pairsDictionary);

                stepTimer.Stop();
                Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));
            }

            Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
            Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
            Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

            Console.ReadKey();
        }
Пример #2
0
        /// <summary>
        /// Runs the MinHash pipeline (signatures, band buckets, candidate vertex pairs) over an
        /// Amazon product metadata JSON dump, printing per-step timings, the number of pairs
        /// found and the prune percentage, then waits for a key press.
        /// </summary>
        /// <param name="sample">
        /// When true, <paramref name="sample_size"/> is passed to the input reader as its limit
        /// argument (otherwise -1 is passed) and the sample size is printed.
        /// </param>
        /// <param name="sample_size">
        /// Limit handed to the reader when <paramref name="sample"/> is true. The pairwise recall
        /// computation is only run when sampling with a size of at most 50000.
        /// </param>
        static void processAmazonJsonDumpFiles(bool sample = false, int sample_size = 1000)
        {
            const string elapsedFormat = "mm\\:ss\\.ff";

            Console.WriteLine("Amazon meta data will be made available (for research purposes) on request. Please contact Julian McAuley ([email protected]) to obtain a link.");

            // NOTE(review): machine-specific absolute path; the dump has to exist at this location.
            string amz_json_file = @"C:\Users\maydar\Documents\Sony Backup\PROJECTS\amazon\review-dumps\test\meta_Office_Products.json.gz";

            int    numHashFunctions            = 130;
            double simThreshold                = 0.65;
            bool   exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set
            MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

            // Enlarging the console buffer is purely cosmetic, so a missing or non-resizable
            // console (redirected output, non-Windows platform) must not abort the run.
            try
            {
                Console.BufferHeight = Int16.MaxValue - 1;
            }
            catch (System.IO.IOException)
            {
                // No real console buffer to resize; continue with the default.
            }
            catch (PlatformNotSupportedException)
            {
                // Setting the buffer height is not supported on this platform; continue.
            }

            Stopwatch stepTimer  = new Stopwatch();
            Stopwatch totalTimer = new Stopwatch();

            int limit = -1;

            if (sample)
            {
                limit = sample_size;
                Console.WriteLine("Sample size: " + sample_size);
            }
            AmazonJsonInputReader amzInputReader = new AmazonJsonInputReader(amz_json_file, false, limit);

            Dictionary <string, string[]> wordList = amzInputReader.productWordList;
            Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));
            long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);

            Console.WriteLine(" ");

            // Step 1: MinHash signatures, using numHashFunctions hash functions per instance.
            Console.WriteLine("\r\nGenerating MinHash signatures ... ");
            totalTimer.Restart();
            stepTimer.Restart();

            Dictionary <string, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);

            stepTimer.Stop();
            Console.WriteLine("Generated MinHash signatures in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));

            // Step 2: band buckets built from the signatures.
            stepTimer.Restart();
            Console.WriteLine("\r\nCreating MinHash buckets ... ");
            Dictionary <string, HashSet <string> > m_lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);

            stepTimer.Stop();
            Console.WriteLine("Created MinHash buckets in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));

            Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

            // Step 3: candidate vertex pairs generated from the buckets.
            stepTimer.Restart();
            Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
            Dictionary <string, Tuple <string, string, double> > pairsDictionary =
                minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList, exclude_sim_under_threshold, null);

            stepTimer.Stop();
            Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));

            totalTimer.Stop();
            Console.WriteLine("\r\nTook total time of: " + totalTimer.Elapsed.ToString(elapsedFormat));

            int foundPairsCount = pairsDictionary.Count;

            // With no possible pairs the ratio would be 0/0 (NaN); report 0 instead.
            double prunePercentage = possiblePairCount > 0
                ? ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0
                : 0.0;

            Console.WriteLine("\r\nBucket pairsDictionary size: " + pairsDictionary.Count);

            // No ground truth is supplied for this dataset (null is passed).
            Cluster <string, string> cls = new Cluster <string, string>(pairsDictionary, null);

            // The returned precision is not used here.
            // NOTE(review): presumably the method reports its result itself -- confirm.
            cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);

            // Recall needs the full pairwise comparison, so it is restricted to small samples.
            if (sample && limit <= 50000)
            {
                stepTimer.Restart();
                Console.WriteLine("Calculating recall from actual should be pairs:");
                Dictionary <string, Tuple <string, string, double> > actualPairsDictionary = Util.getActualPairsDictionary(wordList, simThreshold);
                Util.calculateRecall <string>(actualPairsDictionary, pairsDictionary);

                stepTimer.Stop();
                Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + stepTimer.Elapsed.ToString(elapsedFormat));
            }

            Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
            Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
            Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

            Console.ReadKey();
        }