/// <summary>
/// Runs the MinHash pipeline on the cleaned news-aggregator corpus file:
/// reads the instances, builds MinHash signatures, groups them into band
/// buckets, generates candidate pairs from those buckets, and prints
/// timings plus the possible/found pair counts and the prune percentage.
/// Blocks on <c>Console.ReadKey()</c> before returning.
/// </summary>
/// <param name="sample">
/// When true, <paramref name="sample_size"/> is passed to the reader as its
/// limit and, for sample sizes up to 10000, recall is additionally computed
/// against an exhaustive pairwise comparison.
/// </param>
/// <param name="sample_size">Limit handed to the reader when sampling.</param>
static void processNewsCorporaFiles(bool sample = false, int sample_size = 1000)
{
    const string elapsedFormat = "mm\\:ss\\.ff";

    string file = dataset_main_location + @"\news aggregator\newsCorpora.csv-clean.txt";
    int numHashFunctions = 130;
    double simThreshold = 0.65;
    // Vertex pairs whose estimated similarity is under the threshold will be excluded if set.
    bool excludeSimUnderThreshold = false;
    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    // Enlarge the console scroll-back buffer so the whole run stays visible.
    Console.BufferHeight = Int16.MaxValue - 1;

    Stopwatch totalWatch = new Stopwatch();
    Stopwatch stepWatch = new Stopwatch();

    int[] index_locations = { 0, 1, 2 };
    // NOTE(review): verbatim literal, i.e. a backslash followed by 't', not a tab
    // character. Presumably SepInputReader interprets it as a pattern - confirm.
    string sep = @"\t";
    // NOTE(review): -1 presumably means "no limit" to the reader - confirm.
    int limit = -1;
    if (sample)
    {
        limit = sample_size;
        Console.WriteLine("Sample size: " + sample_size);
    }

    SepInputReader<int, string> sepInputReader =
        new SepInputReader<int, string>(file, index_locations, sep, false, limit);
    Dictionary<int, string> groundTruth = sepInputReader.groundTruth;
    Dictionary<int, string[]> wordList = sepInputReader.wordList;

    Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));
    long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);

    // Step 1: MinHash signatures (numHashFunctions hash functions per instance).
    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    totalWatch.Restart();
    stepWatch.Restart();
    Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);
    stepWatch.Stop();
    Console.WriteLine("Generated MinHash signatures in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));

    // Step 2: band buckets.
    stepWatch.Restart();
    Console.WriteLine("\r\nCreating MinHash buckets ... ");
    Dictionary<string, HashSet<int>> lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);
    Console.WriteLine("Created MinHash buckets in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));
    stepWatch.Stop();

    Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

    // Step 3: candidate pairs from the buckets.
    stepWatch.Restart();
    Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
    Dictionary<string, Tuple<int, int, double>> pairsDictionary =
        minHasher.generateVertexPairs(lshBuckets, docMinhashes, wordList, excludeSimUnderThreshold, null);
    Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));
    stepWatch.Stop();
    totalWatch.Stop();
    Console.WriteLine("\r\nTook total time of: " + totalWatch.Elapsed.ToString(elapsedFormat));

    int foundPairsCount = pairsDictionary.Count;
    // Share of all possible pairs that never had to be compared.
    double prunePercentage = ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0;

    Cluster<int, string> cls = new Cluster<int, string>(pairsDictionary, groundTruth);

    // The returned precision value is not reported by this method itself.
    stepWatch.Restart();
    cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);
    Console.WriteLine("Calculated precision from found pairs in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));
    stepWatch.Stop();

    // The exhaustive pairwise comparison is only attempted for small samples.
    if (sample && sample_size <= 10000)
    {
        stepWatch.Restart();
        Console.WriteLine("Calculating recall from actual should be pairs:");
        Dictionary<string, Tuple<int, int, double>> actualPairsDictionary =
            Util.getActualPairsDictionary(wordList, simThreshold);
        // The returned recall value is not reported by this method itself.
        Util.calculateRecall<int>(actualPairsDictionary, pairsDictionary);
        Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));
        stepWatch.Stop();
    }

    Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
    Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
    Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}
/// <summary>
/// Runs the MinHash pipeline on an Amazon product-metadata JSON dump:
/// reads the products, builds MinHash signatures, groups them into band
/// buckets, generates candidate pairs from those buckets, and prints
/// timings plus the possible/found pair counts and the prune percentage.
/// Blocks on <c>Console.ReadKey()</c> before returning.
/// </summary>
/// <param name="sample">
/// When true, <paramref name="sample_size"/> is passed to the reader as its
/// limit and, for limits up to 50000, recall is additionally computed
/// against an exhaustive pairwise comparison.
/// </param>
/// <param name="sample_size">Limit handed to the reader when sampling.</param>
static void processAmazonJsonDumpFiles(bool sample = false, int sample_size = 1000)
{
    const string elapsedFormat = "mm\\:ss\\.ff";

    // NOTE(review): the e-mail address in this message reads "[email protected]",
    // which looks like an obfuscation placeholder - restore the real address if known.
    Console.WriteLine("Amazon meta data will be made available (for research purposes) on request. Please contact Julian McAuley ([email protected]) to obtain a link.");

    // NOTE(review): machine-specific absolute path; consider deriving it from a
    // configurable base directory.
    string amz_json_file = @"C:\Users\maydar\Documents\Sony Backup\PROJECTS\amazon\review-dumps\test\meta_Office_Products.json.gz";
    int numHashFunctions = 130;
    double simThreshold = 0.65;
    // Vertex pairs whose estimated similarity is under the threshold will be excluded if set.
    bool excludeSimUnderThreshold = false;
    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    // Enlarge the console scroll-back buffer so the whole run stays visible.
    Console.BufferHeight = Int16.MaxValue - 1;

    Stopwatch stepWatch = new Stopwatch();
    Stopwatch totalWatch = new Stopwatch();

    // NOTE(review): -1 presumably means "no limit" to the reader - confirm.
    int limit = -1;
    if (sample)
    {
        limit = sample_size;
        Console.WriteLine("Sample size: " + sample_size);
    }

    AmazonJsonInputReader amzInputReader = new AmazonJsonInputReader(amz_json_file, false, limit);
    Dictionary<string, string[]> wordList = amzInputReader.productWordList;

    Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));
    long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);
    Console.WriteLine(" ");

    // Step 1: MinHash signatures (numHashFunctions hash functions per instance).
    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    totalWatch.Restart();
    stepWatch.Restart();
    Dictionary<string, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);
    stepWatch.Stop();
    Console.WriteLine("Generated MinHash signatures in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));

    // Step 2: band buckets.
    stepWatch.Restart();
    Console.WriteLine("\r\nCreating MinHash buckets ... ");
    Dictionary<string, HashSet<string>> lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);
    Console.WriteLine("Created MinHash buckets in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));
    stepWatch.Stop();

    Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

    // Step 3: candidate pairs from the buckets.
    stepWatch.Restart();
    Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
    Dictionary<string, Tuple<string, string, double>> pairsDictionary =
        minHasher.generateVertexPairs(lshBuckets, docMinhashes, wordList, excludeSimUnderThreshold, null);
    Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));
    stepWatch.Stop();
    totalWatch.Stop();
    Console.WriteLine("\r\nTook total time of: " + totalWatch.Elapsed.ToString(elapsedFormat));

    int foundPairsCount = pairsDictionary.Count;
    // Share of all possible pairs that never had to be compared.
    double prunePercentage = ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0;
    Console.WriteLine("\r\nBucket pairsDictionary size: " + pairsDictionary.Count);

    // No ground truth is supplied for this data set.
    Cluster<string, string> cls = new Cluster<string, string>(pairsDictionary, null);

    // The returned precision value is not reported by this method itself.
    cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);

    // The exhaustive pairwise comparison is only attempted for small samples.
    if (sample && limit <= 50000)
    {
        stepWatch.Restart();
        Console.WriteLine("Calculating recall from actual should be pairs:");
        Dictionary<string, Tuple<string, string, double>> actualPairsDictionary =
            Util.getActualPairsDictionary(wordList, simThreshold);
        // The returned recall value is not reported by this method itself.
        Util.calculateRecall<string>(actualPairsDictionary, pairsDictionary);
        Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + stepWatch.Elapsed.ToString(elapsedFormat));
        stepWatch.Stop();
    }

    Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
    Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
    Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}