/// <summary>
/// Computes the exact Jaccard similarity for every unordered pair of documents in
/// <paramref name="wordList"/> and keeps the pairs whose similarity is at least
/// <paramref name="threshold"/>.
/// </summary>
/// <param name="wordList">Map from document id to the document's word-id list.</param>
/// <param name="threshold">Minimum Jaccard similarity for a pair to be retained.</param>
/// <returns>Map from pair key (via getKeyFromPair) to (doc1, doc2, similarity).</returns>
public static Dictionary <string, Tuple <T1, T1, double> > getActualPairsDictionary <T1>(Dictionary <T1, List <int> > wordList, double threshold)
{
    var retained = new Dictionary <string, Tuple <T1, T1, double> >();
    List <T1> docs = wordList.Keys.ToList();

    for (int left = 0; left < docs.Count; left++)
    {
        for (int right = left + 1; right < docs.Count; right++)
        {
            // Canonical key so (a,b) and (b,a) collapse to one entry.
            string pairKey = getKeyFromPair(docs[left], docs[right]);
            if (retained.ContainsKey(pairKey))
            {
                continue;
            }

            double similarity = Jaccard.Calc(wordList[docs[left]], wordList[docs[right]]);
            if (similarity >= threshold)
            {
                retained.Add(pairKey, Tuple.Create(docs[left], docs[right], similarity));
            }
        }
    }

    return retained;
}
/// <summary>
/// Measures how closely MinHash-estimated Jaccard similarity tracks the exact
/// Jaccard similarity, averaged over all document pairs with non-zero actual
/// similarity. Prints and returns the average percentage deviation.
/// </summary>
/// <param name="wordListActual">Map from document id to its raw word-id list.</param>
/// <param name="wordListMinHash">Map from document id to its MinHash signature.</param>
/// <returns>Average |minhash - actual| / actual * 100 over compared pairs; 0 when no pair qualifies.</returns>
public static double calculateMinHashFunctionsAccuracy <T1>(Dictionary <T1, List <int> > wordListActual, Dictionary <T1, List <uint> > wordListMinHash)
{
    List <T1> docList = wordListActual.Keys.ToList();
    double total_diff_perc = 0;
    int pair_count = 0;

    for (int i = 0; i < docList.Count; i++)
    {
        for (int j = i + 1; j < docList.Count; j++)
        {
            double jaccard_actual = Jaccard.Calc(wordListActual[docList[i]], wordListActual[docList[j]]);
            // Pairs with zero actual similarity are skipped: the relative error
            // below would divide by zero.
            if (jaccard_actual > 0)
            {
                double jaccard_minhash = Jaccard.Calc(wordListMinHash[docList[i]], wordListMinHash[docList[j]]);
                double diff_perc = (Math.Abs(jaccard_minhash - jaccard_actual) / jaccard_actual) * 100;
                total_diff_perc += diff_perc;
                pair_count++;
            }
        }
    }

    // FIX: the original divided by pair_count unconditionally; when no pair has
    // positive actual similarity that is 0/0 => NaN. Report 0% instead.
    double avg_diff_perc = pair_count > 0 ? total_diff_perc / pair_count : 0;
    Console.WriteLine("Average diff from Actual and MinHash Jaccard is: " + avg_diff_perc + " %");
    return avg_diff_perc;
}
/// <summary>
/// Smoke test for Jaccard.Calc: prints the similarity of two overlapping
/// documents, a document with itself, and two mostly-disjoint documents.
/// </summary>
public static void JaccardTest1()
{
    // FIX: removed the wordDict dictionary the original built and never used.
    List <int> doc1 = new List <int> { 2, 3, 4, 2 };
    List <int> doc2 = new List <int> { 1, 5, 4, 2 };
    List <int> doc3 = new List <int> { 1 };

    Console.WriteLine("Jaccard: " + Jaccard.Calc(doc1, doc2));
    // Identical inputs: expected similarity is 1.
    Console.WriteLine("Jaccard: " + Jaccard.Calc(doc1, doc1));
    // Disjoint inputs: expected similarity is 0.
    Console.WriteLine("Jaccard: " + Jaccard.Calc(doc1, doc3));
}
/// <summary>
/// Ad-hoc experiment harness comparing actual vs. MinHash-estimated Jaccard
/// similarity on randomly generated number documents. Only the first experiment
/// (MinHasher2) currently runs; the mid-method <c>return</c> statements disable
/// the later experiments.
/// </summary>
/// <param name="sample">Unused; kept for signature compatibility.</param>
/// <param name="sample_size">Unused; kept for signature compatibility.</param>
static void processNumbersTest3(bool sample = false, int sample_size = 1000)
{
    int numHashFunctions = 128;
    int universeSize = 1000;
    double simThreshold = 0.65;
    double atn = 0.05;

    // --- Experiment 1: MinHasher2 on raw int[] documents (active) ---
    MinHasher2 mh2 = new MinHasher2(numHashFunctions, simThreshold);
    NumberDocumentCreator numDocCreator2 = new NumberDocumentCreator(10, universeSize);
    int[] a1 = numDocCreator2.createDocument(universeSize);
    int[] a2 = numDocCreator2.createDocument(universeSize);
    Console.WriteLine("Actual jaccaard: " + MinHasher2.calculateJaccard(a1, a2));
    Console.WriteLine("MinHash jaccaard: " + MinHasher2.calculateJaccard(mh2.getMinHashSignature(a1), mh2.getMinHashSignature(a2)));
    return; // NOTE(review): everything below is intentionally disabled dead code.

    // --- Experiment 2: MinHasher3 on List<int> documents (disabled) ---
    MinHasher3 mh = new MinHasher3(universeSize, numHashFunctions);
    MinHasher_Buckets3 mhb = new MinHasher_Buckets3(mh, simThreshold, atn);
    NumberDocumentCreator numDocCreator = new NumberDocumentCreator(10, universeSize);
    List <int> s1 = numDocCreator.createDocument(universeSize).ToList();
    List <int> s2 = numDocCreator.createDocument(universeSize).ToList();
    Console.WriteLine("Actual jaccaard: " + Jaccard.Calc(s1, s2));
    Console.WriteLine("MinHash jaccaard: " + Jaccard.Calc(mh.GetMinHash(s1), mh.GetMinHash(s2)));
    return; // NOTE(review): experiment 3 below is also disabled dead code.

    // --- Experiment 3: full-collection MinHash accuracy measurement (disabled) ---
    Dictionary <int, List <int> > wordList = numDocCreator.documentCollectionList;
    // Now minhash each of the documents created above.
    //MinHasher minHasher = new MinHasher(500, 5);
    Console.WriteLine("\r\nGenerating MinHash signatures ... ");
    Dictionary <int, List <uint> > docMinhashes = mhb.createMinhashCollection(wordList); //minHasher.createMinhashCollection(wordList);
    double avg_diff_perc_from_actual_and_minhash_jaccard = Util.calculateMinHashFunctionsAccuracy(wordList, docMinhashes);

    /*StringDocumentCreator strDocCreator = new StringDocumentCreator(100, 10000);
     *
     * Dictionary<int, string[]> wordList2 = strDocCreator.documentCollection;
     *
     * //Now create a MinHasher object to minhash each of the documents created above
     * //using 300 unique hashing functions.
     * //MinHasher minHasher = new MinHasher(500, 5);
     * Console.WriteLine("\r\nGenerating MinHash signatures ... ");
     * Dictionary<int, int[]> docMinhashes2 = minHasher.createMinhashCollection(wordList2);
     * double avg_diff_perc_from_actual_and_minhash_jaccard2 = Util.calculateMinHashFunctionsAccuracy(wordList2, docMinhashes2);
     */
    Console.ReadKey();
}
/*
 * Graph::generateCommonPairs
 * Generate the list of Vertex pairs that share common properties. The pairs in this list will be input to the OurSim calculations.
 * We are doing this to recover from n-square complexity.
 */
/// <summary>
/// Walks the LSH buckets and, for each pair of documents that share a bucket,
/// computes the exact Jaccard similarity from <paramref name="wordList"/>.
/// Qualifying pairs are collected and optionally written to
/// <paramref name="output_file_name"/> (prefixed with the pair count, which a
/// downstream C++ reader needs for vector space allocation).
/// </summary>
/// <param name="m_lshBuckets">LSH bucket key -> set of document ids hashed to that bucket.</param>
/// <param name="docMinhashes">Document MinHash signatures (currently unused; exact Jaccard is used instead).</param>
/// <param name="wordList">Document id -> raw word-id list used for the exact Jaccard computation.</param>
/// <param name="exclude_sim_under_threshold">When true, pairs below sim_threshold are dropped.</param>
/// <param name="output_file_name">Destination file, or null to skip file output.</param>
/// <returns>Map from canonical pair key to (doc1, doc2, jaccard).</returns>
public Dictionary <string, Tuple <T1, T1, double> > generateVertexPairs <T1, T>(Dictionary <string, HashSet <T1> > m_lshBuckets, Dictionary <T1, List <uint> > docMinhashes, Dictionary <T1, List <int> > wordList, bool exclude_sim_under_threshold, string output_file_name)
{
    Dictionary <string, Tuple <T1, T1, double> > pairsDictionary = new Dictionary <string, Tuple <T1, T1, double> >();
    int loopCount = 0;
    string sep = " #-# ";
    string temp_file_name = output_file_name + ".temp";
    StreamWriter wr = null;

    // FIX: writer/reader are now closed via finally/using so an exception mid-run
    // no longer leaks the file handles; dead locals (pre-loop j, bucketIndex) removed.
    try
    {
        if (output_file_name != null)
        {
            wr = new StreamWriter(temp_file_name); // write the pairs to a temp file first
            wr.WriteLine("-common_pairs-");
            wr.WriteLine(string.Format("vid1{0}vid2{0}minhash_sim", sep));
        }

        foreach (var bucket in m_lshBuckets)
        {
            // A bucket with a single member produces no pairs.
            if (bucket.Value.Count <= 1)
            {
                continue;
            }

            List <T1> docList = bucket.Value.ToList();
            for (int i = 0; i < docList.Count; i++)
            {
                for (int j = i + 1; j < docList.Count; j++)
                {
                    string sum = Util.getKeyFromPair(docList[i], docList[j]);
                    if (!pairsDictionary.ContainsKey(sum))
                    {
                        // Exact Jaccard from the raw word lists (not the MinHash estimate).
                        double jaccard = Jaccard.Calc(wordList[docList[i]], wordList[docList[j]]);
                        if (!exclude_sim_under_threshold || jaccard >= sim_threshold)
                        {
                            pairsDictionary.Add(sum, new Tuple <T1, T1, double>(docList[i], docList[j], jaccard));
                            if (wr != null)
                            {
                                wr.WriteLine(docList[i] + sep + docList[j] + sep + jaccard);
                            }
                        }
                    }
                    loopCount++;
                }
            }
        }
    }
    finally
    {
        if (wr != null)
        {
            wr.Close();
        }
    }

    Console.WriteLine("\r\nBucket generating candidate pairs complexity: " + loopCount);

    if (output_file_name != null)
    {
        // Second pass: prepend the pair count (needed for C++ vector space
        // allocation), then copy the temp file's contents across.
        using (StreamWriter finalWriter = new StreamWriter(output_file_name))
        using (StreamReader rd = new StreamReader(temp_file_name))
        {
            finalWriter.WriteLine(pairsDictionary.Count);
            string buf;
            while ((buf = rd.ReadLine()) != null)
            {
                finalWriter.WriteLine(buf);
            }
        }
        File.Delete(temp_file_name);
    }

    return pairsDictionary;
}
/// <summary>
/// Calculates the similarity of two lists of MinHash values. Approximately
/// numerically equivalent to the Jaccard similarity of the underlying sets.
/// </summary>
/// <param name="l1">First MinHash signature.</param>
/// <param name="l2">Second MinHash signature.</param>
/// <returns>Jaccard similarity of the two signatures.</returns>
public double Similarity(List <uint> l1, List <uint> l2)
{
    // FIX: removed the unused "new Jaccard()" instance the original allocated;
    // Jaccard.Calc is static.
    return Jaccard.Calc(l1, l2);
}
/// <summary>
/// Smoke test for MinHasher3: builds two fixed multisets of ints, sizes the
/// hash universe from their union, and prints the MinHash-estimated similarity
/// next to the exact Jaccard similarity.
/// </summary>
public static void MinHasher3TestFunc1()
{
    List <int> inums1 = new List <int>
    {
        10, 8, 11, 13, 2, 17, 3, 1, 19, 11,
        100, 82, 115, 13, 2, 107, 3, 1, 19, 110,
        10, 8, 110, 131, 2, 173, 3, 1, 19, 114,
        10, 8, 11, 13, 2, 17, 3, 1, 19, 115,
        10, 8, 11, 133, 2, 17, 3, 1, 19, 11,
        10, 8, 11, 13, 2, 17, 3, 1, 19, 171
    };

    List <int> inums2 = new List <int>
    {
        1, 2, 5, 9, 12, 17, 13, 11, 9, 10,
        1, 2, 5, 9, 12, 17, 13, 11, 9, 10,
        1, 2, 5, 9, 12, 17, 13, 151, 9, 510,
        1, 2, 5, 9, 12, 17, 13, 11, 95, 10,
        1, 23, 5, 9, 162, 17, 13, 11, 93, 10,
        19, 23, 5, 9, 12, 17, 13, 141, 94, 10
    };

    // The hash universe only needs to cover the distinct values present.
    int universeSize = Jaccard.unionSize(inums1, inums2);
    MinHasher3 mh = new MinHasher3(universeSize, 135);

    List <uint> hvs1 = mh.GetMinHash(inums1).ToList();
    List <uint> hvs2 = mh.GetMinHash(inums2).ToList();

    Console.WriteLine();
    Console.WriteLine("Estimated similarity: " + mh.Similarity(hvs1, hvs2));
    Console.WriteLine("Jaccard similarity: " + Jaccard.Calc(inums1, inums2));
    Console.WriteLine("done");
}