public static Dictionary<string, Tuple<T1, T1, double>> getActualPairsDictionary<T1, T>(Dictionary<T1, T[]> wordList, double threshold)
{
    Dictionary<string, Tuple<T1, T1, double>> pairsDictionary = new Dictionary<string, Tuple<T1, T1, double>>();
    List<T1> docList = wordList.Keys.ToList();
    int i, j;
    string sum;
    double jaccard;

    for (i = 0; i < docList.Count; i++)
    {
        for (j = i + 1; j < docList.Count; j++)
        {
            //sum = docList[i] + "#" + docList[j];
            sum = getKeyFromPair(docList[i], docList[j]);
            if (!pairsDictionary.ContainsKey(sum))
            {
                jaccard = MinHasher2.calculateJaccard(wordList[docList[i]], wordList[docList[j]]);
                if (jaccard >= threshold)
                {
                    pairsDictionary.Add(sum, new Tuple<T1, T1, double>(docList[i], docList[j], jaccard));
                }
            }
        }
    }

    return pairsDictionary;
}
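// The helpers used above (getKeyFromPair, MinHasher2.calculateJaccard) are defined elsewhere in
// this project. The hypothetical sketches below only illustrate the assumed semantics (a symmetric
// pair key, and a plain set-based Jaccard |A ∩ B| / |A ∪ B| over the token arrays); the real
// implementations may differ, so these are named with a _sketch suffix to avoid any confusion.
public static string getKeyFromPair_sketch<T1>(T1 a, T1 b)
{
    // Order the two ids so that (a, b) and (b, a) produce the same dictionary key.
    string s1 = a.ToString(), s2 = b.ToString();
    return string.CompareOrdinal(s1, s2) <= 0 ? s1 + "#" + s2 : s2 + "#" + s1;
}

public static double calculateJaccard_sketch<T>(T[] x, T[] y)
{
    // Set-based Jaccard over the distinct tokens of the two documents.
    HashSet<T> union = new HashSet<T>(x);
    union.UnionWith(y);
    HashSet<T> intersection = new HashSet<T>(x);
    intersection.IntersectWith(y);
    return union.Count == 0 ? 0.0 : (double)intersection.Count / union.Count;
}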
/*
 * Check to see if jaccard(minhashed) == jaccard(actual documents)
 */
public static double calculateMinHashFunctionsAccuracy<T1, T>(Dictionary<T1, T[]> wordListActual, Dictionary<T1, int[]> wordListMinHash)
{
    List<T1> docList = wordListActual.Keys.ToList();
    int i, j;
    double jaccard_actual, jaccard_minhash;
    double total_diff_perc = 0;
    double diff_perc;
    int pair_count = 0;

    for (i = 0; i < docList.Count; i++)
    {
        for (j = i + 1; j < docList.Count; j++)
        {
            jaccard_actual = MinHasher2.calculateJaccard(wordListActual[docList[i]], wordListActual[docList[j]]);
            if (jaccard_actual > 0)
            {
                jaccard_minhash = MinHasher2.calculateJaccard(wordListMinHash[docList[i]], wordListMinHash[docList[j]]);
                diff_perc = (Math.Abs(jaccard_minhash - jaccard_actual) / jaccard_actual) * 100;
                total_diff_perc += diff_perc;
                pair_count++;
            }
        }
    }

    double avg_diff_perc = total_diff_perc / pair_count;
    Console.WriteLine("Average diff from Actual and MinHash Jaccard is: " + avg_diff_perc + " %");
    return avg_diff_perc;
}
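// For MinHash signatures, the standard Jaccard estimator is the fraction of signature positions
// on which the two documents agree; its expectation equals the true Jaccard similarity. The
// accuracy check above instead reuses MinHasher2.calculateJaccard on the signature arrays, so the
// helper below is only a hypothetical sketch of the textbook estimator, kept here for reference.
public static double estimateJaccardFromSignatures_sketch(int[] sig1, int[] sig2)
{
    if (sig1.Length == 0 || sig1.Length != sig2.Length)
    {
        throw new ArgumentException("Signatures must be non-empty and of equal length.");
    }

    int matches = 0;
    for (int k = 0; k < sig1.Length; k++)
    {
        if (sig1[k] == sig2[k])
        {
            matches++;
        }
    }
    return (double)matches / sig1.Length;
}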
static void processNumbersTest(bool sample = false, int sample_size = 1000)
{
    //to try minhash on news corpora file
    string file = @"C:\Users\maydar\Dropbox\Semantic Study\ScabilityPaper\datasets\news aggregator\newsCorpora.csv-clean.txt";
    string pair_output_filename = file + "_minhashpairs.txt";

    int numHashFunctions = 2000;
    double simThreshold = 0.65;
    bool exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set

    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    NumberDocumentCreator numDocCreator = new NumberDocumentCreator(10, 100000);
    Dictionary<int, int[]> wordList = numDocCreator.documentCollection;

    //Now create a MinHasher object to minhash each of the documents created above
    //using 300 unique hashing functions.
    //MinHasher minHasher = new MinHasher(500, 5);
    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);

    double avg_diff_perc_from_actual_and_minhash_jaccard = Util.calculateMinHashFunctionsAccuracy(wordList, docMinhashes);

    /*StringDocumentCreator strDocCreator = new StringDocumentCreator(100, 10000);
     *
     * Dictionary<int, string[]> wordList2 = strDocCreator.documentCollection;
     *
     * //Now create a MinHasher object to minhash each of the documents created above
     * //using 300 unique hashing functions.
     * //MinHasher minHasher = new MinHasher(500, 5);
     * Console.WriteLine("\r\nGenerating MinHash signatures ... ");
     * Dictionary<int, int[]> docMinhashes2 = minHasher.createMinhashCollection(wordList2);
     * double avg_diff_perc_from_actual_and_minhash_jaccard2 = Util.calculateMinHashFunctionsAccuracy(wordList2, docMinhashes2);
     */

    Console.ReadKey();
}
static void processNumbersTest3(bool sample = false, int sample_size = 1000)
{
    int numHashFunctions = 128;
    int universeSize = 1000;
    double simThreshold = 0.65;
    double atn = 0.05;

    MinHasher2 mh2 = new MinHasher2(numHashFunctions, simThreshold);
    NumberDocumentCreator numDocCreator2 = new NumberDocumentCreator(10, universeSize);
    int[] a1 = numDocCreator2.createDocument(universeSize);
    int[] a2 = numDocCreator2.createDocument(universeSize);

    Console.WriteLine("Actual jaccard: " + MinHasher2.calculateJaccard(a1, a2));
    Console.WriteLine("MinHash jaccard: " + MinHasher2.calculateJaccard(mh2.getMinHashSignature(a1), mh2.getMinHashSignature(a2)));
    return;

    // NOTE: the code below is unreachable because of the early return above;
    // it is kept for experimenting with MinHasher3 / MinHasher_Buckets3.
    MinHasher3 mh = new MinHasher3(universeSize, numHashFunctions);
    MinHasher_Buckets3 mhb = new MinHasher_Buckets3(mh, simThreshold, atn);

    NumberDocumentCreator numDocCreator = new NumberDocumentCreator(10, universeSize);
    List<int> s1 = numDocCreator.createDocument(universeSize).ToList();
    List<int> s2 = numDocCreator.createDocument(universeSize).ToList();

    Console.WriteLine("Actual jaccard: " + Jaccard.Calc(s1, s2));
    Console.WriteLine("MinHash jaccard: " + Jaccard.Calc(mh.GetMinHash(s1), mh.GetMinHash(s2)));
    return;

    Dictionary<int, List<int>> wordList = numDocCreator.documentCollectionList;

    //Now create a MinHasher object to minhash each of the documents created above
    //using 300 unique hashing functions.
    //MinHasher minHasher = new MinHasher(500, 5);
    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    Dictionary<int, List<uint>> docMinhashes = mhb.createMinhashCollection(wordList); //minHasher.createMinhashCollection(wordList);

    double avg_diff_perc_from_actual_and_minhash_jaccard = Util.calculateMinHashFunctionsAccuracy(wordList, docMinhashes);

    /*StringDocumentCreator strDocCreator = new StringDocumentCreator(100, 10000);
     *
     * Dictionary<int, string[]> wordList2 = strDocCreator.documentCollection;
     *
     * //Now create a MinHasher object to minhash each of the documents created above
     * //using 300 unique hashing functions.
     * //MinHasher minHasher = new MinHasher(500, 5);
     * Console.WriteLine("\r\nGenerating MinHash signatures ... ");
     * Dictionary<int, int[]> docMinhashes2 = minHasher.createMinhashCollection(wordList2);
     * double avg_diff_perc_from_actual_and_minhash_jaccard2 = Util.calculateMinHashFunctionsAccuracy(wordList2, docMinhashes2);
     */

    Console.ReadKey();
}
public double calculatePrecision_fromActualSimilarity(Dictionary<T1, T[]> documents, double threshold) //precision from real jaccard of the pairs
{
    Tuple<T1, T1, double> t;
    T1 i, j;
    int correct_pairs = 0;

    foreach (string key in pairsDictionary.Keys)
    {
        t = pairsDictionary[key];
        i = t.Item1;
        j = t.Item2;
        if (MinHasher2.calculateJaccard(documents[i], documents[j]) >= threshold)
        {
            correct_pairs++;
        }
    }

    this.precision_from_actualSimilarity = (double)correct_pairs / (double)pairsDictionary.Count;
    Console.WriteLine("Precision percentage(from actual similarity) is: " + precision_from_actualSimilarity * 100 + "%");
    return this.precision_from_actualSimilarity;
}
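// Util.calculateRecall, used by the driver methods further below, is defined elsewhere in this
// project. Assuming both dictionaries are keyed by the same symmetric pair key, recall is
// presumably the share of the pairwise-computed (ground-truth) pairs that the LSH pipeline also
// found. The following is only a hypothetical sketch under that assumption, not the actual helper.
public static double calculateRecall_sketch<T1>(Dictionary<string, Tuple<T1, T1, double>> actualPairs, Dictionary<string, Tuple<T1, T1, double>> foundPairs)
{
    if (actualPairs.Count == 0)
    {
        return 0;
    }

    int found = 0;
    foreach (string key in actualPairs.Keys)
    {
        if (foundPairs.ContainsKey(key))
        {
            found++;
        }
    }
    return (double)found / actualPairs.Count;
}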
static void generatePairsFileForRoleSim()
{
    //to generate pair file for role-sim jaccard
    //string rdf_flat_file = @"../../input\infobox_properties_100000_flat.txt";
    //string rdf_flat_file = @"C:\Users\maydar\Documents\Visual Studio 2013\Projects\clean-v1-opt1\data-sets\university\sparql_university_4.txt_flat.txt";
    //string rdf_flat_file = @"C:\Users\maydar\Documents\Visual Studio 2013\Projects\clean-v1-opt1\data-sets\Lubm\university_all.txt_flat.txt";
    string rdf_flat_file = @"C:\Users\maydar\Documents\Sony Backup\PHD\SEMANTIC STUDY\dbpedia\infobox\infobox_properties_10000000_flat.txt";

    string pair_output_filename = rdf_flat_file + "_minhashpairs.txt";

    int numHashFunctions = 250;
    double simThreshold = 0.33;
    bool exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set

    //MinHasher minHasher = new MinHasher(numHashFunctions, simThreshold);
    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****

    Stopwatch sw = new Stopwatch();

    //Create a collection of n documents with a max length of 1000 tokens
    /*NumberDocumentCreator numDocCreator = new NumberDocumentCreator(10, 10000);
     * //Create a single test document
     * int[] testDoc = numDocCreator.createDocument(10000);*/

    //StringDocumentCreator strDocCreator = new StringDocumentCreator(100, 10000);
    //Create a single test document
    //string[] testDoc = strDocCreator.createDocument(10000);

    /*int testDocIndex = 1;
     * string[] testDoc = strDocCreator.documentCollection[testDocIndex];
     * double entireCount = testDoc.Length;*/

    FlatInputReader flatInputReader = new FlatInputReader(rdf_flat_file);

    Console.WriteLine(" ");

    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    sw.Restart();
    Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(flatInputReader.vertexLabelList);
    sw.Stop();
    Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

    sw.Restart();
    Console.WriteLine("\r\nCreating MinHash buckets ... ");
    Dictionary<string, HashSet<int>> m_lshBuckets = minHasher.createBandBuckets(flatInputReader.vertexLabelList, docMinhashes);
    Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(flatInputReader.vertexLabelList.Count, 3) / 5);

    sw.Restart();
    Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
    Dictionary<string, Tuple<int, int, double>> pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, flatInputReader.vertexLabelList, exclude_sim_under_threshold, pair_output_filename);
    Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    Console.WriteLine("\r\nBucket pairsDictionary size: " + pairsDictionary.Count);

    Console.ReadKey();
}
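// createBandBuckets presumably splits each signature into b bands of r rows and hashes every band
// into a bucket (standard MinHash LSH banding). Under that assumption, the similarity at which a
// pair becomes likely to collide in at least one band is roughly (1/b)^(1/r), so numHashFunctions
// and the band/row split should be chosen with simThreshold in mind. A hypothetical helper:
public static double approximateBandingThreshold_sketch(int bands, int rowsPerBand)
{
    // t ≈ (1/b)^(1/r); for example, 250 hash functions split into 50 bands of 5 rows gives t ≈ 0.46.
    return Math.Pow(1.0 / bands, 1.0 / rowsPerBand);
}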
static void processNewsCorporaFiles(bool sample = false, int sample_size = 1000)
{
    //to try minhash on news corpora file
    //string file = @"C:\Users\maydar\Dropbox\Semantic Study\ScabilityPaper\datasets\news aggregator\newsCorpora.csv-clean.txt";
    string file = dataset_main_location + @"\news aggregator\newsCorpora.csv-clean.txt";

    string pair_output_filename = file + "_minhashpairs.txt";

    int numHashFunctions = 130;
    double simThreshold = 0.65;
    bool exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set

    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    Dictionary<int, string[]> wordList;

    Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****

    Stopwatch sw0 = new Stopwatch();
    Stopwatch sw = new Stopwatch();

    int[] index_locations = { 0, 1, 2 };
    string sep = @"\t";
    int limit = -1;

    if (sample)
    {
        limit = sample_size;
        Console.WriteLine("Sample size: " + sample_size);
    }

    SepInputReader<int, string> sepInputReader = new SepInputReader<int, string>(file, index_locations, sep, false, limit);
    Dictionary<int, string> groundTruth = sepInputReader.groundTruth;

    //Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(flatInputReader.vertexLabelList);

    wordList = sepInputReader.wordList;
    Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));

    long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);

    /*if (!sample)
     *  wordList = sepInputReader.wordList;
     * else
     * {
     *  wordList = Util.getSampleFromDict(sepInputReader.wordList, sample_size);
     * }*/

    //Now create a MinHasher object to minhash each of the documents created above
    //using 300 unique hashing functions.
    //MinHasher minHasher = new MinHasher(500, 5);
    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    sw0.Restart();
    sw.Restart();
    Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);

    if (sample)
    {
        //double avg_diff_perc_from_actual_and_minhash_jaccard = Util.calculateMinHashFunctionsAccuracy(wordList, docMinhashes);
    }

    sw.Stop();
    Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

    sw.Restart();
    Console.WriteLine("\r\nCreating MinHash buckets ... ");
    Dictionary<string, HashSet<int>> m_lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);
    Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

    /*
     * sw.Restart();
     * Console.WriteLine("\r\nListing buckets sizes ... ");
     * minHasher.listBucketSizes(m_lshBuckets, pair_output_filename);
     * Console.WriteLine("Listing buckets sizes in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
     * sw.Stop();*/

    sw.Restart();
    Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
    Dictionary<string, Tuple<int, int, double>> pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList, exclude_sim_under_threshold, null);
    Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    sw0.Stop();
    Console.WriteLine("\r\nTook total time of: " + sw0.Elapsed.ToString("mm\\:ss\\.ff"));

    int foundPairsCount = pairsDictionary.Count;
    double prunePercentage = ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0;

    Cluster<int, string> cls = new Cluster<int, string>(pairsDictionary, groundTruth);
    //cls.generateClusers1();
    //double precision_from_groundTruth = cls.calculatePrecision_fromGroundTruth();

    sw.Restart();
    double precision_from_actualSimilarity = cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);
    Console.WriteLine("Calculated precision from found pairs in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    if (sample && sample_size <= 10000)
    {
        sw.Restart();
        Console.WriteLine("Calculating recall from actual should be pairs:");
        Dictionary<string, Tuple<int, int, double>> actualPairsDictionary = Util.getActualPairsDictionary(wordList, simThreshold);
        double recall = Util.calculateRecall<int>(actualPairsDictionary, pairsDictionary);
        Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
        sw.Stop();

        /*Dictionary<string, Tuple<int, int, double>> actualMinHashPairsDictionary = Util.getActualPairsDictionary(docMinhashes, simThreshold);
         * Console.WriteLine("Calculating recall from actual MinHash pairs:");
         * recall = Util.calculateRecall<int>(actualMinHashPairsDictionary, pairsDictionary);*/

        int a = 0;
    }

    Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
    Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
    Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

    int x = 1;
    Console.ReadKey();
}
static void processNewsCorporaFiles_InstanceMatch(bool sample = false, int sample_size = 1000)
{
    //to try minhash on news corpora file
    string file = @"C:\Users\maydar\Dropbox\Semantic Study\ScabilityPaper\datasets\news aggregator\newsCorpora.csv-clean.txt";

    string pair_output_filename = file + "_minhashpairs.txt";

    int numHashFunctions = 130;
    double simThreshold = 0.65;
    bool exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set

    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    Dictionary<int, string[]> wordList1, wordList2;
    Dictionary<string, string[]> wordList3;

    Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****

    Stopwatch sw = new Stopwatch();

    int[] index_locations = { 0, 1, 2 };
    string sep = @"\t";
    int limit = -1;

    if (sample)
    {
        limit = sample_size;
    }

    SepInputReader<int, string> sepInputReader1 = new SepInputReader<int, string>(file, index_locations, sep, false, limit);
    wordList1 = sepInputReader1.wordList;

    SepInputReader<int, string> sepInputReader2 = new SepInputReader<int, string>(file, index_locations, sep, false, limit);
    wordList2 = sepInputReader2.wordList;

    Console.WriteLine("\r\nMerging the two wordLists ... ");
    wordList3 = Util.mergeTwoWordLists(wordList1, wordList2);

    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    sw.Restart();
    Dictionary<string, int[]> docMinhashes = minHasher.createMinhashCollection(wordList3);
    sw.Stop();
    Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

    sw.Restart();
    Console.WriteLine("\r\nCreating MinHash buckets ... ");
    Dictionary<string, HashSet<string>> m_lshBuckets = minHasher.createBandBuckets(wordList3, docMinhashes);
    Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList3.Count, 3) / 5);

    sw.Restart();
    Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
    Dictionary<string, Tuple<string, string, double>> pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList3, exclude_sim_under_threshold, null, true);
    Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    Cluster<string, string> cls = new Cluster<string, string>(pairsDictionary, null);

    sw.Restart();
    double precision_from_actualSimilarity = cls.calculatePrecision_fromActualSimilarity(wordList3, simThreshold);
    Console.WriteLine("Calculated precision from found pairs in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    if (sample)
    {
        sw.Restart();
        Console.WriteLine("Calculating recall from actual should be pairs:");
        Dictionary<string, Tuple<string, string, double>> actualPairsDictionary = Util.getActualPairsDictionary(wordList3, simThreshold);
        double recall = Util.calculateRecall<string>(actualPairsDictionary, pairsDictionary);
        Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
        sw.Stop();

        /*Dictionary<string, Tuple<int, int, double>> actualMinHashPairsDictionary = Util.getActualPairsDictionary(docMinhashes, simThreshold);
         * Console.WriteLine("Calculating recall from actual MinHash pairs:");
         * recall = Util.calculateRecall<int>(actualMinHashPairsDictionary, pairsDictionary);*/

        int a = 0;
    }

    int x = 1;
    Console.ReadKey();
}
private static void processSpimbenchFiles_InstanceMatch(bool sample = false, int sample_size = 1000)
{
    Console.WriteLine("Processing Spimbench_large ...");

    string file1 = dataset_main_location + @"\IM2016_Spimbench_large\Abox1.nt";
    string file2 = dataset_main_location + @"\IM2016_Spimbench_large\Abox2.nt";
    string file_gt = dataset_main_location + @"\IM2016_Spimbench_large\refalign.rdf"; //ground truth file

    int numHashFunctions = 128;
    double simThreshold = 0.3;

    string pair_output_filename = file1 + "_minhashpairs.txt";
    string prefix1 = "|first|", prefix2 = "|second|", sep_prefix = "-";

    bool exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set

    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    Dictionary<string, string[]> wordList;

    Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****

    Stopwatch sw = new Stopwatch();
    Stopwatch sw0 = new Stopwatch();

    int limit = -1;
    if (sample)
    {
        limit = sample_size;
        Console.WriteLine("Sample size: " + sample_size);
    }

    UobmInputReader uobmInputReader = new UobmInputReader(file1, file2, file_gt, limit, prefix1, prefix2, sep_prefix);
    wordList = uobmInputReader.wordList;
    Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));

    //long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);
    long possiblePairCount = uobmInputReader.possiblePairsCount;

    sw0.Restart();

    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    sw.Restart();
    Dictionary<string, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);
    sw.Stop();
    Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

    sw.Restart();
    Console.WriteLine("\r\nCreating MinHash buckets ... ");
    Dictionary<string, HashSet<string>> m_lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);
    Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

    sw.Restart();
    Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
    Dictionary<string, Tuple<string, string, double>> pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList, exclude_sim_under_threshold, null, true, prefix1, prefix2, sep_prefix);
    Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    sw0.Stop();
    Console.WriteLine("\r\nTook total time of: " + sw0.Elapsed.ToString("mm\\:ss\\.ff"));

    int foundPairsCount = pairsDictionary.Count;
    double prunePercentage = ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0;

    Cluster<string, string> cls = new Cluster<string, string>(pairsDictionary, null);

    sw.Restart();
    double precision_from_actualSimilarity = cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);
    Console.WriteLine("Calculated precision from found pairs in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    sw.Restart();
    Console.WriteLine("Calculating recall from ground truth:");
    double recall = Util.calculateRecall<string>(uobmInputReader.gtPairsDictionary, pairsDictionary);
    Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    double fmeasure = 2 * ((precision_from_actualSimilarity * recall) / (precision_from_actualSimilarity + recall));
    Console.WriteLine("F-measure: " + fmeasure);

    Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
    Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
    Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

    Console.ReadKey();
}
static void processAmazonJsonDumpFiles(bool sample = false, int sample_size = 1000)
{
    Console.WriteLine("Amazon meta data will be made available (for research purposes) on request. Please contact Julian McAuley ([email protected]) to obtain a link.");

    //to try minhash on amazon json dump files
    string amz_json_file = @"C:\Users\maydar\Documents\Sony Backup\PROJECTS\amazon\review-dumps\test\meta_Office_Products.json.gz";

    string pair_output_filename = amz_json_file + "_minhashpairs.txt";

    int numHashFunctions = 130;
    double simThreshold = 0.65;
    bool exclude_sim_under_threshold = false; //vertex pairs which have estimated similarity under the threshold will be excluded if set

    //MinHasher minHasher = new MinHasher(numHashFunctions, simThreshold);
    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    Dictionary<string, string[]> wordList;

    Console.BufferHeight = Int16.MaxValue - 1; // ***** Alters the BufferHeight *****

    Stopwatch sw = new Stopwatch();
    Stopwatch sw0 = new Stopwatch();

    int limit = -1;
    if (sample)
    {
        limit = sample_size;
        Console.WriteLine("Sample size: " + sample_size);
    }

    AmazonJsonInputReader amzInputReader = new AmazonJsonInputReader(amz_json_file, false, limit);

    //Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(flatInputReader.vertexLabelList);

    /*if (!sample)
     *  wordList = amzInputReader.productWordList;
     * else
     * {
     *  wordList = Util.getSampleFromDict(amzInputReader.productWordList, sample_size);
     * }*/

    wordList = amzInputReader.productWordList;
    Console.WriteLine(string.Format("\r\nInstances count: {0}", wordList.Count));

    long possiblePairCount = PermutationsAndCombinations.nCr(wordList.Count, 2);

    Console.WriteLine(" ");

    //Now create a MinHasher object to minhash each of the documents created above
    //using 300 unique hashing functions.
    //MinHasher minHasher = new MinHasher(500, 5);
    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    sw0.Restart();
    sw.Restart();
    //Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(flatInputReader.vertexLabelList);
    Dictionary<string, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);
    sw.Stop();
    Console.WriteLine("Generated MinHash signatures in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));

    sw.Restart();
    Console.WriteLine("\r\nCreating MinHash buckets ... ");
    Dictionary<string, HashSet<string>> m_lshBuckets = minHasher.createBandBuckets(wordList, docMinhashes);
    Console.WriteLine("Created MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    Console.WriteLine("\r\nComplexity with regular jaccard lookup(estimate): " + Math.Pow(wordList.Count, 3) / 5);

    sw.Restart();
    Console.WriteLine("\r\nGenerating vertex pairs using MinHash buckets ... ");
    Dictionary<string, Tuple<string, string, double>> pairsDictionary = minHasher.generateVertexPairs(m_lshBuckets, docMinhashes, wordList, exclude_sim_under_threshold, null);
    Console.WriteLine("Generated vertex pairs using MinHash buckets in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
    sw.Stop();

    sw0.Stop();
    Console.WriteLine("\r\nTook total time of: " + sw0.Elapsed.ToString("mm\\:ss\\.ff"));

    int foundPairsCount = pairsDictionary.Count;
    double prunePercentage = ((double)(possiblePairCount - foundPairsCount) / (double)possiblePairCount) * 100.0;

    Console.WriteLine("\r\nBucket pairsDictionary size: " + pairsDictionary.Count);

    Cluster<string, string> cls = new Cluster<string, string>(pairsDictionary, null);
    //cls.generateClusers1();
    //double precision_from_groundTruth = cls.calculatePrecision_fromGroundTruth();
    double precision_from_actualSimilarity = cls.calculatePrecision_fromActualSimilarity(wordList, simThreshold);

    if (sample && limit <= 50000)
    {
        sw.Restart();
        Console.WriteLine("Calculating recall from actual should be pairs:");
        Dictionary<string, Tuple<string, string, double>> actualPairsDictionary = Util.getActualPairsDictionary(wordList, simThreshold);
        double recall = Util.calculateRecall<string>(actualPairsDictionary, pairsDictionary);
        Console.WriteLine("Calculated recall from the algorithm vs pairwise-comparison in Time : " + sw.Elapsed.ToString("mm\\:ss\\.ff"));
        sw.Stop();

        int a = 0;
    }

    Console.WriteLine(string.Format("\r\nPossible pairs count: {0}", possiblePairCount));
    Console.WriteLine(string.Format("\r\nFound pairs count: {0}", foundPairsCount));
    Console.WriteLine(string.Format("\r\nPrune percentage: {0}", prunePercentage));

    Console.ReadKey();
}