/// <summary>
/// Manual console test: min-hashes the document collection exposed by a
/// <c>NumberDocumentCreator</c> and runs <c>Util.calculateMinHashFunctionsAccuracy</c>
/// over the documents and their signatures. Waits for a key press before returning.
/// </summary>
/// <param name="sample">Not read by this method; kept so existing callers keep compiling.</param>
/// <param name="sample_size">Not read by this method; kept so existing callers keep compiling.</param>
static void processNumbersTest(bool sample = false, int sample_size = 1000)
{
    int numHashFunctions = 2000;
    double simThreshold = 0.65;

    // The hasher is constructed before the document creator, as in the original ordering.
    MinHasher2 minHasher = new MinHasher2(numHashFunctions, simThreshold);

    // NOTE(review): the meaning of the arguments (10, 100000) is not visible from here;
    // presumably document count and value range - confirm against NumberDocumentCreator.
    NumberDocumentCreator numDocCreator = new NumberDocumentCreator(10, 100000);
    Dictionary<int, int[]> wordList = numDocCreator.documentCollection;

    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    Dictionary<int, int[]> docMinhashes = minHasher.createMinhashCollection(wordList);

    // The returned double is not consumed here.
    // NOTE(review): presumably the helper reports its own findings - confirm, otherwise print the result.
    Util.calculateMinHashFunctionsAccuracy(wordList, docMinhashes);

    // Keep the console window open until the user presses a key.
    Console.ReadKey();
}
/// <summary>
/// Manual console test: creates two number documents, then prints the value of
/// <c>MinHasher2.calculateJaccard</c> for the raw documents and for their
/// <c>MinHasher2</c> signatures so the two figures can be compared by eye.
/// </summary>
/// <remarks>
/// Only the <c>MinHasher2</c> comparison is executed; the method returns right after
/// printing the two lines and does not wait for a key press.
/// </remarks>
/// <param name="sample">Not read by this method; kept so existing callers keep compiling.</param>
/// <param name="sample_size">Not read by this method; kept so existing callers keep compiling.</param>
static void processNumbersTest3(bool sample = false, int sample_size = 1000)
{
    int numHashFunctions = 128;
    int universeSize = 1000;
    double simThreshold = 0.65;

    MinHasher2 mh2 = new MinHasher2(numHashFunctions, simThreshold);

    // NOTE(review): the meaning of the first constructor argument (10) is not visible
    // from here - confirm against NumberDocumentCreator.
    NumberDocumentCreator numDocCreator2 = new NumberDocumentCreator(10, universeSize);

    int[] a1 = numDocCreator2.createDocument(universeSize);
    int[] a2 = numDocCreator2.createDocument(universeSize);

    // Output labels are reproduced exactly as the original emitted them.
    Console.WriteLine("Actual jaccaard: " + MinHasher2.calculateJaccard(a1, a2));
    Console.WriteLine("MinHash jaccaard: " + MinHasher2.calculateJaccard(mh2.getMinHashSignature(a1), mh2.getMinHashSignature(a2)));
}