예제 #1
0
        /// <summary>
        /// Builds one index vector per document with the bi-gram scheme and one with
        /// Fu's uni-gram scheme, prints the elapsed time of each build, then waits for
        /// a line on standard input and runs the configured query against both indexes.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        public static void Main(string[] args)
        {
            // Number of slots in every per-document index vector.
            const int BloomSize = 8000;

            // Drop whatever a previous run left in the shared collections.
            blooms.Clear();
            bloomsFu.Clear();
            _wordIdf.Clear();
            List <List <string> > stemmedDocs;
            List <string>         vocabulary;

            FileList = SchemeProcess.GenerateFileList().ToArray();

            stopwatch.Restart();
            vocabulary = SchemeProcess.GetVocabulary(FileList, out stemmedDocs, 0);
            stopwatch.Stop();
            Console.WriteLine(stopwatch.Elapsed.TotalMilliseconds);

            if (_wordIdf.Count == 0)
            {
                // Store, for every vocabulary term, the number of documents that contain it;
                // the value is kept for later use when generating the trapdoor.
                Dictionary <string, int> documentFrequency = CountDocumentFrequency(stemmedDocs);
                foreach (var term in vocabulary)
                {
                    // TryGetValue leaves 0 for a term that occurs in no document.
                    int containingDocs;
                    documentFrequency.TryGetValue(term, out containingDocs);
                    _wordIdf[term] = containingDocs;
                }
            }

            // My scheme (bi-gram).
            stopwatch.Restart();
            foreach (List <string> stemDoc in stemmedDocs)
            {
                double[] bloom = new double[BloomSize];
                int[]    count = new int[BloomSize];

                // One pass over the document yields every distinct stem with its occurrence count.
                foreach (KeyValuePair <string, int> entry in CountTerms(stemDoc))
                {
                    double tf = (double)entry.Value / (double)stemDoc.Count;
                    var biList = MyScheme.TransformKeywordsToBiGram(entry.Key);
                    var index  = MyScheme.BiGramToVector(biList);
                    foreach (int i in _mh.getMinHashSignatures(index))
                    {
                        AddTermFrequency(bloom, count, i, tf);
                    }
                }
                blooms.Add(bloom);
            }
            stopwatch.Stop();
            Console.WriteLine($"Bi-Gram生成索引所需时间为:{stopwatch.Elapsed.TotalMilliseconds}");

            // Fu's scheme (uni-gram).
            stopwatch.Restart();
            foreach (List <string> stemDoc in stemmedDocs)
            {
                double[] bloomFu = new double[BloomSize];
                int[]    countFu = new int[BloomSize];

                foreach (KeyValuePair <string, int> entry in CountTerms(stemDoc))
                {
                    double tf = (double)entry.Value / (double)stemDoc.Count;
                    var uniList = FuScheme.TransformKeywordsToUniGram(entry.Key);
                    var index   = FuScheme.UniGramToVector(uniList);
                    foreach (int i in _mh.getMinHashSignatures(index))
                    {
                        AddTermFrequency(bloomFu, countFu, i, tf);
                    }
                }
                bloomsFu.Add(bloomFu);
            }
            stopwatch.Stop();
            Console.WriteLine($"Uni-Gram生成索引所需时间为:{stopwatch.Elapsed.TotalMilliseconds}");

            // Block until a line is read so the timings can be inspected before querying.
            Console.ReadLine();

            threshold = CaculateThreshold(_queryString);

            QueryWithMyScheme(_queryString);
            QueryWithFuScheme(_queryString);
        }

        /// <summary>
        /// Counts the occurrences of each distinct term in a single pass.
        /// </summary>
        /// <param name="terms">The terms of one document, duplicates included.</param>
        /// <returns>A map from each distinct term to its number of occurrences.</returns>
        private static Dictionary <string, int> CountTerms(List <string> terms)
        {
            var occurrences = new Dictionary <string, int>();
            foreach (string term in terms)
            {
                int seen;
                occurrences.TryGetValue(term, out seen);
                occurrences[term] = seen + 1;
            }
            return occurrences;
        }

        /// <summary>
        /// Counts, for each term, the number of documents in which it appears at least once.
        /// </summary>
        /// <param name="docs">The documents, each given as its list of terms.</param>
        /// <returns>A map from each term to the number of documents containing it.</returns>
        private static Dictionary <string, int> CountDocumentFrequency(List <List <string> > docs)
        {
            var frequency = new Dictionary <string, int>();
            foreach (List <string> doc in docs)
            {
                // De-duplicate first so a term is counted once per document.
                foreach (string term in new HashSet <string>(doc))
                {
                    int seen;
                    frequency.TryGetValue(term, out seen);
                    frequency[term] = seen + 1;
                }
            }
            return frequency;
        }

        /// <summary>
        /// Folds one term-frequency value into a slot of an index vector, keeping the slot
        /// equal to the running mean of the values written to it.
        /// </summary>
        /// <param name="bloom">Index vector holding the per-slot mean.</param>
        /// <param name="count">Per-slot number of values folded in so far.</param>
        /// <param name="slot">Target slot; values at or beyond the vector length are skipped.</param>
        /// <param name="tf">Term frequency to fold in.</param>
        private static void AddTermFrequency(double[] bloom, int[] count, int slot, double tf)
        {
            if (slot >= bloom.Length)
            {
                return;
            }

            if (bloom[slot] == 0)
            {
                bloom[slot] = tf;
                count[slot]++;
            }
            else
            {
                // Operands are evaluated left to right, so the multiplication reads the
                // old count before the divisor increments it.
                bloom[slot] = (bloom[slot] * count[slot] + tf) / (++count[slot]);
            }
        }
예제 #2
0
        /// <summary>
        /// Builds one index vector for each of two sample sentences from the MinHash
        /// signatures of their bi-grams, prints the two vectors slot by slot, and finally
        /// prints how many slots hold the value 1 in both vectors.
        /// </summary>
        public void TestIntMinHash()
        {
            // NOTE(review): the meaning of the two constructor arguments is not visible here — confirm.
            MinHash _mh = new MinHash(1000, 100);

            double[] bloom  = new double[10000];
            int[]    count  = new int[10000];
            double[] bloom1 = new double[10000];
            int[]    count1 = new int[10000];

            int matches = 0;

            List <string> stemmedDoc;

            var stemSet  = SchemeProcess.GetVocabulary("my name is zjw", out stemmedDoc, 0);
            var stemSet1 = SchemeProcess.GetVocabulary("my name is wrm", out stemmedDoc, 0);

            foreach (string stem in stemSet)
            {
                var biList = MyScheme.TransformKeywordsToBiGram(stem);
                foreach (string s in biList)
                {
                    foreach (int i in _mh.getMinHashSignatures(s))
                    {
                        // Skip signatures that do not fit in the vector.
                        if (i >= bloom.Length)
                        {
                            continue;
                        }
                        if (bloom[i] == 0)
                        {
                            bloom[i] = 1;
                            count[i]++;
                        }
                        else
                        {
                            // Running mean of the values written to this slot.
                            bloom[i] = (bloom[i] * count[i] + 1) / (++count[i]);
                        }
                    }
                }
            }

            foreach (string stem in stemSet1)
            {
                var biList = MyScheme.TransformKeywordsToBiGram(stem);
                foreach (string s in biList)
                {
                    foreach (int i in _mh.getMinHashSignatures(s))
                    {
                        // Same bound as the first vector: the original compared against 1000
                        // here, so the two vectors were filled under different rules.
                        if (i >= bloom1.Length)
                        {
                            continue;
                        }
                        if (bloom1[i] == 0)
                        {
                            bloom1[i] = 1;
                            count1[i]++;
                        }
                        else
                        {
                            bloom1[i] = (bloom1[i] * count1[i] + 1) / (++count1[i]);
                        }
                    }
                }
            }

            for (int i = 0; i < bloom.Length; i++)
            {
                // A slot matches when both sentences set it.
                if (bloom[i] == bloom1[i] && bloom[i].Equals(1))
                {
                    matches++;
                }
                Console.Write($"{bloom[i]}         ");
                Console.WriteLine(bloom1[i]);
            }
            Console.WriteLine(matches);
        }