/// <summary>
/// Computes the score threshold for the query text <paramref name="s"/>.
/// Every query stem is mapped to vector slots through its bi-gram min-hash
/// signatures; each touched slot receives a fixed term frequency and an
/// IDF-style weight, and the threshold is the dot product of the two vectors.
/// </summary>
/// <param name="s">The query text.</param>
/// <returns>The sum over all slots of (fixed term frequency * IDF-style weight).</returns>
private static double CaculateThreshold(string s)
{
    // Length of the index vectors; signature values at or above it are ignored.
    const int VectorLength = 8000;
    // Term frequency assigned to every slot touched by a query stem.
    const double AssumedTf = 0.15;

    List<string> stemmedDoc;
    List<string> vocabulary = SchemeProcess.GetVocabulary(s, out stemmedDoc, 0);

    double[] tfs = new double[VectorLength];
    double[] bloom = new double[VectorLength];

    foreach (string stem in vocabulary)
    {
        // The weight depends only on the stem, so compute it once per stem
        // instead of once per signature index. Stems missing from _wordIdf
        // are treated as having a document frequency of 0.
        double documentFrequency = _wordIdf.ContainsKey(stem) ? _wordIdf[stem] : 0;
        double idf = Math.Log((double)FileList.Length / (documentFrequency + 1));

        var biList = MyScheme.TransformKeywordsToBiGram(stem);
        var index = MyScheme.BiGramToVector(biList);

        foreach (int i in _mh.getMinHashSignatures(index))
        {
            if (i >= VectorLength)
            {
                continue;
            }

            if (tfs[i] == 0)
            {
                tfs[i] = AssumedTf;
            }

            // A slot keeps the first non-zero weight written to it. A slot that
            // is still 0 (untouched, or written with a weight of exactly 0) is
            // overwritten by the current stem.
            if (bloom[i] == 0)
            {
                bloom[i] = idf;
            }
        }
    }

    double res = 0;
    for (int i = 0; i < VectorLength; i++)
    {
        res += tfs[i] * bloom[i];
    }

    return res;
}
/// <summary>
/// Program entry point. Resets the shared index state, loads and stems the
/// document collection, records per-term document frequencies, builds one
/// term-frequency vector per document for the bi-gram scheme and one for
/// Fu's uni-gram scheme (printing the elapsed time of each build), and then
/// runs the configured query with both schemes.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    // Length of every index vector; signature values at or above it are ignored.
    const int VectorLength = 8000;

    blooms.Clear();
    bloomsFu.Clear();
    _wordIdf.Clear();

    List<List<string>> stemmedDocs;
    FileList = SchemeProcess.GenerateFileList().ToArray();

    stopwatch.Restart();
    List<string> vocabulary = SchemeProcess.GetVocabulary(FileList, out stemmedDocs, 0);
    stopwatch.Stop();
    Console.WriteLine(stopwatch.Elapsed.TotalMilliseconds);

    if (_wordIdf.Count == 0)
    {
        // Store each term's document frequency for later use when the trapdoor
        // (query vector) is generated. Each document contributes every distinct
        // stem once, so the number of lookup entries under a term equals the
        // number of documents containing it; unknown terms yield an empty group.
        var documentsByTerm = stemmedDocs
            .SelectMany(doc => doc.Distinct())
            .ToLookup(docStem => docStem);

        foreach (var term in vocabulary)
        {
            _wordIdf[term] = documentsByTerm[term].Count();
        }
    }

    // My scheme (bi-gram).
    stopwatch.Restart();
    foreach (List<string> stemDoc in stemmedDocs)
    {
        double[] bloom = new double[VectorLength];
        int[] count = new int[VectorLength];

        // One group per distinct stem (in first-occurrence order); the group
        // size is the stem's occurrence count, so the document is scanned once
        // instead of once per distinct stem.
        foreach (var occurrences in stemDoc.GroupBy(token => token))
        {
            string stem = occurrences.Key;
            double tf = (double)occurrences.Count() / (double)stemDoc.Count;

            var biList = MyScheme.TransformKeywordsToBiGram(stem);
            var index = MyScheme.BiGramToVector(biList);

            foreach (int i in _mh.getMinHashSignatures(index))
            {
                if (i >= VectorLength)
                {
                    continue;
                }

                // Running mean of the term frequencies hashed to this slot.
                // For an empty slot (count 0) this evaluates to exactly tf.
                bloom[i] = (bloom[i] * count[i] + tf) / (++count[i]);
            }
        }

        blooms.Add(bloom);
    }
    stopwatch.Stop();
    Console.WriteLine($"Bi-Gram生成索引所需时间为:{stopwatch.Elapsed.TotalMilliseconds}");

    // Fu's scheme (uni-gram).
    stopwatch.Restart();
    foreach (List<string> stemDoc in stemmedDocs)
    {
        double[] bloomFu = new double[VectorLength];
        int[] countFu = new int[VectorLength];

        foreach (var occurrences in stemDoc.GroupBy(token => token))
        {
            string stem = occurrences.Key;
            double tf = (double)occurrences.Count() / (double)stemDoc.Count;

            var uniList = FuScheme.TransformKeywordsToUniGram(stem);
            var index = FuScheme.UniGramToVector(uniList);

            foreach (int i in _mh.getMinHashSignatures(index))
            {
                if (i >= VectorLength)
                {
                    continue;
                }

                // Same running mean as in the bi-gram index above.
                bloomFu[i] = (bloomFu[i] * countFu[i] + tf) / (++countFu[i]);
            }
        }

        bloomsFu.Add(bloomFu);
    }
    stopwatch.Stop();
    Console.WriteLine($"Uni-Gram生成索引所需时间为:{stopwatch.Elapsed.TotalMilliseconds}");
    Console.ReadLine();

    threshold = CaculateThreshold(_queryString);
    QueryWithMyScheme(_queryString);
    QueryWithFuScheme(_queryString);
}
/// <summary>
/// Runs a search with the author's (bi-gram) scheme: builds a query vector of
/// IDF-style weights from the stems of <paramref name="s"/>, scores every
/// indexed document by the dot product of its vector with the query vector,
/// keeps the documents whose score reaches the current threshold, and prints
/// them in descending score order.
/// </summary>
/// <param name="s">The query text.</param>
private static void QueryWithMyScheme(string s)
{
    // Length of the index vectors; signature values at or above it are ignored.
    const int VectorLength = 8000;

    Console.WriteLine($"当前使用方案为本人的");
    Console.WriteLine($"当前检索文本为{s}");

    // Start from an empty result set: hits left over from an earlier query
    // would otherwise be printed again or collide with Add on a repeated file.
    _myScheme.Clear();

    List<string> stemmedDoc;
    List<string> vocabulary = SchemeProcess.GetVocabulary(s, out stemmedDoc, 0);

    double[] bloom = new double[VectorLength];
    int[] count = new int[VectorLength];

    foreach (string stem in vocabulary)
    {
        // The weight depends only on the stem, so compute it once per stem.
        // Stems missing from _wordIdf are treated as document frequency 0.
        double documentFrequency = _wordIdf.ContainsKey(stem) ? _wordIdf[stem] : 0;
        double idf = Math.Log((double)FileList.Length / (documentFrequency + 1));

        var biList = MyScheme.TransformKeywordsToBiGram(stem);
        var index = MyScheme.BiGramToVector(biList);

        foreach (int i in _mh.getMinHashSignatures(index))
        {
            if (i >= VectorLength)
            {
                continue;
            }

            // Running mean of the weights hashed to this slot, driven by
            // count[i] rather than by "bloom[i] == 0": a weight of exactly 0 is
            // a legitimate value and must not be mistaken for an empty slot.
            // For an empty slot (count 0) this evaluates to exactly idf.
            bloom[i] = (bloom[i] * count[i] + idf) / (double)(++count[i]);
        }
    }

    for (int i = 0; i < blooms.Count; i++)
    {
        double score = 0;
        for (int j = 0; j < VectorLength; j++)
        {
            score += blooms[i][j] * bloom[j];
        }

        if (score >= threshold)
        {
            _myScheme.Add(FileList[i], score);
        }
    }

    int rank = 1;
    foreach (KeyValuePair<string, double> item in _myScheme.OrderByDescending(hit => hit.Value))
    {
        Console.WriteLine($"{rank++}、文本为{item.Key};");
        Console.WriteLine($" 分数为{item.Value};");
    }

    Console.ReadLine();
}