/// <summary>
/// Builds the per-document index vectors for both schemes, prints how long each
/// phase took, waits for the user to press Enter, and then runs the configured
/// query against both indexes.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Clears <c>blooms</c>, <c>bloomsFu</c> and <c>_wordIdf</c>.
/// 2. Loads the file list and the stemmed documents / vocabulary, timing the load.
/// 3. Stores, for every vocabulary term, the number of documents containing it in <c>_wordIdf</c>.
/// 4. Builds one 8000-slot vector per document from bi-gram signatures ("my scheme").
/// 5. Builds one 8000-slot vector per document from uni-gram signatures (Fu's scheme).
/// 6. Computes the threshold and runs both query routines.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    // Number of slots in every per-document index vector. Signature values at or
    // above this are ignored by AccumulateTermFrequency.
    const int bloomSize = 8000;

    blooms.Clear();
    bloomsFu.Clear();
    _wordIdf.Clear();

    List<List<string>> stemmedDocs;
    FileList = SchemeProcess.GenerateFileList().ToArray();

    stopwatch.Restart();
    List<string> vocabulary = SchemeProcess.GetVocabulary(FileList, out stemmedDocs, 0);
    stopwatch.Stop();
    Console.WriteLine(stopwatch.Elapsed.TotalMilliseconds);

    if (_wordIdf.Count == 0)
    {
        // Populate the IDF table (document count per term) for later use when
        // generating trapdoors. Each document is turned into a set once so the
        // membership test is O(1) instead of a linear List.Contains scan.
        List<HashSet<string>> docTermSets = stemmedDocs
            .Select(doc => new HashSet<string>(doc))
            .ToList();
        foreach (var term in vocabulary)
        {
            _wordIdf[term] = docTermSets.Count(terms => terms.Contains(term));
        }
    }

    // My scheme: bi-gram based index.
    stopwatch.Restart();
    foreach (List<string> stemDoc in stemmedDocs)
    {
        double[] bloom = new double[bloomSize];
        int[] count = new int[bloomSize];

        // GroupBy yields each distinct stem once, in order of first occurrence,
        // together with all of its occurrences, so the term frequency comes from
        // a single pass instead of one full scan per distinct stem.
        foreach (IGrouping<string, string> occurrences in stemDoc.GroupBy(s => s))
        {
            string stem = occurrences.Key;
            double tf = (double)occurrences.Count() / stemDoc.Count;

            var biList = MyScheme.TransformKeywordsToBiGram(stem);
            var index = MyScheme.BiGramToVector(biList);
            foreach (int slot in _mh.getMinHashSignatures(index))
            {
                AccumulateTermFrequency(bloom, count, slot, tf);
            }
        }

        blooms.Add(bloom);
    }
    stopwatch.Stop();
    Console.WriteLine($"Bi-Gram生成索引所需时间为:{stopwatch.Elapsed.TotalMilliseconds}");

    // Fu's scheme: uni-gram based index.
    stopwatch.Restart();
    foreach (List<string> stemDoc in stemmedDocs)
    {
        double[] bloomFu = new double[bloomSize];
        int[] countFu = new int[bloomSize];

        foreach (IGrouping<string, string> occurrences in stemDoc.GroupBy(s => s))
        {
            string stem = occurrences.Key;
            double tf = (double)occurrences.Count() / stemDoc.Count;

            var uniList = FuScheme.TransformKeywordsToUniGram(stem);
            var index = FuScheme.UniGramToVector(uniList);
            foreach (int slot in _mh.getMinHashSignatures(index))
            {
                AccumulateTermFrequency(bloomFu, countFu, slot, tf);
            }
        }

        bloomsFu.Add(bloomFu);
    }
    stopwatch.Stop();
    Console.WriteLine($"Uni-Gram生成索引所需时间为:{stopwatch.Elapsed.TotalMilliseconds}");

    // Pause until the user presses Enter before running the queries.
    Console.ReadLine();
    threshold = CaculateThreshold(_queryString);
    QueryWithMyScheme(_queryString);
    QueryWithFuScheme(_queryString);
}

/// <summary>
/// Folds one term frequency into the running average stored at <paramref name="slot"/>.
/// </summary>
/// <param name="bloom">Per-slot running average of the term frequencies seen so far.</param>
/// <param name="count">Per-slot number of term frequencies folded into <paramref name="bloom"/>.</param>
/// <param name="slot">Signature value used as the slot index; values at or above the vector length are skipped.</param>
/// <param name="tf">Term frequency to fold in.</param>
private static void AccumulateTermFrequency(double[] bloom, int[] count, int slot, double tf)
{
    if (slot >= bloom.Length)
    {
        return;
    }

    // Incremental mean. For an untouched slot (value 0, count 0) this evaluates
    // to exactly tf, so no separate first-hit branch is needed.
    int seen = count[slot];
    bloom[slot] = (bloom[slot] * seen + tf) / (seen + 1);
    count[slot] = seen + 1;
}
/// <summary>
/// Manual check of the bi-gram signature pipeline: builds one slot vector for each
/// of two short sentences, prints both vectors side by side (one slot per line),
/// and finally prints how many slots are set to 1 in both.
/// </summary>
public void TestIntMinHash()
{
    // NOTE(review): the meaning of the two constructor arguments is defined by
    // MinHash, which is not visible here.
    MinHash minHash = new MinHash(1000, 100);

    double[] bloom = new double[10000];
    int[] count = new int[10000];
    double[] bloom1 = new double[10000];
    int[] count1 = new int[10000];

    // The stemmed-document output is required by the call but not used here.
    List<string> stemmedDoc;
    var stemSet = SchemeProcess.GetVocabulary("my name is zjw", out stemmedDoc, 0);
    var stemSet1 = SchemeProcess.GetVocabulary("my name is wrm", out stemmedDoc, 0);

    // Both sentences go through the same routine so they share one slot bound.
    // Previously the second sentence was filtered with a hard-coded 1000 while
    // the first used 10000, which skewed the comparison below.
    AccumulateBiGramSignatures(minHash, stemSet, bloom, count);
    AccumulateBiGramSignatures(minHash, stemSet1, bloom1, count1);

    int len = 0;
    for (int i = 0; i < bloom.Length; i++)
    {
        // Slots only ever hold exactly 0 or 1 (a running average of 1s), so an
        // exact floating-point comparison is safe here.
        if (bloom[i] == 1 && bloom1[i] == 1)
        {
            len++;
        }

        Console.Write($"{bloom[i]} ");
        Console.WriteLine(bloom1[i]);
    }

    Console.WriteLine(len);
}

/// <summary>
/// Marks, for every bi-gram of every stem, the slots named by its signature values.
/// </summary>
/// <param name="minHash">Signature generator applied to each bi-gram string.</param>
/// <param name="stems">Stems to index.</param>
/// <param name="bloom">Slot vector to update; each hit folds a weight of 1 into the slot's running average.</param>
/// <param name="count">Per-slot hit counters matching <paramref name="bloom"/>.</param>
private static void AccumulateBiGramSignatures(MinHash minHash, IEnumerable<string> stems, double[] bloom, int[] count)
{
    foreach (string stem in stems)
    {
        var biList = MyScheme.TransformKeywordsToBiGram(stem);
        foreach (string biGram in biList)
        {
            foreach (int slot in minHash.getMinHashSignatures(biGram))
            {
                // Skip signature values that do not fit in the vector.
                if (slot >= bloom.Length)
                {
                    continue;
                }

                // Incremental mean with weight 1; yields exactly 1 on the first hit.
                int seen = count[slot];
                bloom[slot] = (bloom[slot] * seen + 1) / (seen + 1);
                count[slot] = seen + 1;
            }
        }
    }
}