Esempio n. 1
0
        /// <summary>
        /// Entry point. Builds one term-frequency filter per document for the bi-gram scheme
        /// and one for the uni-gram (Fu) scheme, printing the elapsed milliseconds of each
        /// phase, then waits for a line on standard input before computing the threshold and
        /// running the query with both schemes.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            // Number of slots in each per-document filter. Signature values that fall
            // outside [0, BloomSize) are ignored.
            const int BloomSize = 8000;

            blooms.Clear();
            bloomsFu.Clear();
            _wordIdf.Clear();
            List<List<string>> stemmedDocs;
            List<string> vocabulary;

            FileList = SchemeProcess.GenerateFileList().ToArray();

            stopwatch.Restart();
            vocabulary = SchemeProcess.GetVocabulary(FileList, out stemmedDocs, 0);
            stopwatch.Stop();
            Console.WriteLine(stopwatch.Elapsed.TotalMilliseconds);

            if (_wordIdf.Count == 0)
            {
                // Record, for every vocabulary term, how many documents contain it. This is
                // the idf input used later when generating trapdoors.
                Dictionary<string, int> documentFrequency = CountDocumentFrequencies(stemmedDocs);
                foreach (var term in vocabulary)
                {
                    // TryGetValue leaves 0 in place for a term that occurs in no document.
                    int containingDocs;
                    documentFrequency.TryGetValue(term, out containingDocs);
                    _wordIdf[term] = containingDocs;
                }
            }

            // My scheme: bi-gram vectors.
            stopwatch.Restart();
            foreach (List<string> stemDoc in stemmedDocs)
            {
                double[] bloom = new double[BloomSize];
                int[] count = new int[BloomSize];

                // One counting pass per document instead of re-scanning it for every stem.
                Dictionary<string, int> occurrences = CountTermOccurrences(stemDoc);

                // Distinct stems are still visited through a HashSet so that slots are
                // updated in the same order as before.
                foreach (string stem in new HashSet<string>(stemDoc))
                {
                    double tf = (double)occurrences[stem] / stemDoc.Count;
                    var biList = MyScheme.TransformKeywordsToBiGram(stem);
                    var index = MyScheme.BiGramToVector(biList);
                    foreach (int i in _mh.getMinHashSignatures(index))
                    {
                        AddToRunningMean(bloom, count, i, tf);
                    }
                }

                blooms.Add(bloom);
            }
            stopwatch.Stop();
            Console.WriteLine($"Bi-Gram生成索引所需时间为:{stopwatch.Elapsed.TotalMilliseconds}");

            // Fu's scheme: uni-gram vectors.
            stopwatch.Restart();
            foreach (List<string> stemDoc in stemmedDocs)
            {
                double[] bloomFu = new double[BloomSize];
                int[] countFu = new int[BloomSize];

                Dictionary<string, int> occurrences = CountTermOccurrences(stemDoc);

                foreach (string stem in new HashSet<string>(stemDoc))
                {
                    double tf = (double)occurrences[stem] / stemDoc.Count;
                    var uniList = FuScheme.TransformKeywordsToUniGram(stem);
                    var index = FuScheme.UniGramToVector(uniList);
                    foreach (int i in _mh.getMinHashSignatures(index))
                    {
                        AddToRunningMean(bloomFu, countFu, i, tf);
                    }
                }

                bloomsFu.Add(bloomFu);
            }
            stopwatch.Stop();
            Console.WriteLine($"Uni-Gram生成索引所需时间为:{stopwatch.Elapsed.TotalMilliseconds}");
            Console.ReadLine();

            threshold = CaculateThreshold(_queryString);

            QueryWithMyScheme(_queryString);
            QueryWithFuScheme(_queryString);
        }

        /// <summary>
        /// Counts, for every term, the number of documents in which it appears at least once.
        /// </summary>
        /// <param name="documents">The tokenised documents.</param>
        /// <returns>A map from term to the number of documents containing it.</returns>
        private static Dictionary<string, int> CountDocumentFrequencies(List<List<string>> documents)
        {
            var frequencies = new Dictionary<string, int>();
            foreach (List<string> document in documents)
            {
                // De-duplicate first so a term counts once per document.
                foreach (string term in new HashSet<string>(document))
                {
                    if (term == null)
                    {
                        // A null token cannot be used as a dictionary key.
                        continue;
                    }

                    int seen;
                    frequencies.TryGetValue(term, out seen);
                    frequencies[term] = seen + 1;
                }
            }

            return frequencies;
        }

        /// <summary>
        /// Counts how many times each term occurs in a single document.
        /// </summary>
        /// <param name="document">The tokenised document.</param>
        /// <returns>A map from term to its number of occurrences in the document.</returns>
        private static Dictionary<string, int> CountTermOccurrences(List<string> document)
        {
            var occurrences = new Dictionary<string, int>();
            foreach (string term in document)
            {
                int seen;
                occurrences.TryGetValue(term, out seen);
                occurrences[term] = seen + 1;
            }

            return occurrences;
        }

        /// <summary>
        /// Folds a term frequency into the running mean stored at one slot of a filter.
        /// Slots outside the filter (including negative values, which would otherwise throw
        /// <see cref="IndexOutOfRangeException"/>) are ignored.
        /// </summary>
        /// <param name="filter">Per-slot mean of the term frequencies folded in so far.</param>
        /// <param name="hits">Per-slot number of values folded in so far.</param>
        /// <param name="slot">The slot to update.</param>
        /// <param name="tf">The term frequency to fold in.</param>
        private static void AddToRunningMean(double[] filter, int[] hits, int slot, double tf)
        {
            if (slot < 0 || slot >= filter.Length)
            {
                return;
            }

            // On the first hit this is (0 * 0 + tf) / 1 == tf, so no special case is needed.
            filter[slot] = (filter[slot] * hits[slot] + tf) / (hits[slot] + 1);
            hits[slot]++;
        }