示例#1
0
 /// <summary>
 /// Pushes one request message into a <c>SequenceScheduler</c> and verifies that the
 /// scheduler then reports exactly one queued item.
 /// </summary>
 public void ProductionTest()
 {
     var site = new Site() { Domain = "www.usashopcn.com" };
     var requestMessage = NewTestRequestMessage(site);
     var bloomFilter = new MemoryBloomFilter<string>(1000 * 10, 1000 * 10 * 20);
     var lineScheduler = new SequenceScheduler(bloomFilter);

     try
     {
         lineScheduler.Push(requestMessage);
         Trace.WriteLine("CurrentQueueCount:" + lineScheduler.CurrentQueueCount());
         Assert.IsTrue(lineScheduler.CurrentQueueCount() == 1, "添加消息失败");
     }
     finally
     {
         // Dispose in a finally block so the scheduler is released even when the
         // assertion (or Push) throws; previously a failing test skipped Dispose.
         lineScheduler.Dispose();
     }
 }
示例#2
0
        // Holds the status timer for as long as the process runs. A System.Threading.Timer
        // that is referenced only by a local variable becomes eligible for garbage
        // collection once the creating method returns, after which it stops firing.
        private static Timer _runStatusTimer;

        /// <summary>
        /// Builds the spider (scheduler, downloader, page analyzer, console result module),
        /// starts it on the Usashopcn home page, and prints <c>RunStatusInfo()</c> to the
        /// console every 2000 ms, starting immediately.
        /// </summary>
        static void RunSpider()
        {
            var bloomFilter  = new MemoryBloomFilter<string>(1000 * 10, 1000 * 10 * 20);
            _spider = new Spider(new SequenceScheduler(bloomFilter));
            var downloaders = new List<IDownloader> { new HttpClientDownloader(4) };
            _spider.RegisterDownloader(downloaders);
            _spider.RegisterPageAnalyzer<UsashopcnPageAnalyzer>(UsashopcnPageAnalyzer.SiteId);
            _spider.RegisterResultPipeModule(new ConsoleModule(0, 20, 400, 500, true, true));

            _spider.Start(TopicType.StaticHtml, SiteIndex.Usashopcn, "http://www.usashopcn.com/");

            // Stop a timer left over from an earlier call so only one status printer runs.
            if (_runStatusTimer != null)
            {
                _runStatusTimer.Dispose();
            }

            // Stored in a static field (not a local) so the timer stays alive after this method returns.
            _runStatusTimer = new Timer(spider => { Console.WriteLine(((Spider) spider).RunStatusInfo()); }, _spider, 0, 2000);
        }
示例#3
0
        // Keeps the status timer reachable for the lifetime of this instance. A
        // System.Threading.Timer referenced only by a local variable may be garbage
        // collected after the creating method returns, which silently stops the callbacks.
        private Timer _statusTimer;

        /// <summary>
        /// Builds the spider (scheduler, downloader, page analyzer, console result module),
        /// starts it on the URLs returned by <c>GetUrls()</c>, and prints the queue/task/consume/result
        /// counters to the console every 2000 ms after an initial 2000 ms delay.
        /// </summary>
        public void RunSpider()
        {
            var bloomFilter = new MemoryBloomFilter<string>(1000 * 10, 1000 * 10 * 20);
            _spider = new Spider(new SequenceScheduler(bloomFilter));
            //var downloaders = new List<IDownloader> { new FakeDownloader(4) };
            var downloaders = new List<IDownloader> { new HttpClientDownloader(4) };
            _spider.RegisterDownloader(downloaders);
            _spider.RegisterPageAnalyzer<UsashopcnPageAnalyzer>(UsashopcnPageAnalyzer.SiteId);
            _spider.RegisterResultPipeModule(new ConsoleModule(500, 0, 400, 500, true, true));

            _spider.Start(TopicType.StaticHtml, SiteIndex.Usashopcn, GetUrls());

            // Stop a timer left over from an earlier call so only one status printer runs.
            if (_statusTimer != null)
            {
                _statusTimer.Dispose();
            }

            // Stored in a field (not a local) so the timer is not collected once this method returns.
            _statusTimer = new Timer(spider =>
            {
                var status = ((Spider) spider).RunStatusInfo();
                Console.WriteLine(String.Format("QueueCount:{0}, TaskCount={1}, ConsumeTotal:{2},  ResultTotal:{3}", status.QueueCount, status.TaskCount, status.ConsumeTotal, status.ResultTotal));
            }, _spider, 2000, 2000);
        }
        /// <summary>
        /// Fills a <c>MemoryBloomFilter&lt;string&gt;</c> with one million GUID strings, asserts
        /// that every inserted string is reported as contained, then probes the filter with a
        /// second, independently generated set of one million GUID strings and traces how many
        /// of those are reported as contained, alongside the filter's own statistics.
        /// </summary>
        public void TestMethod1()
        {
            var sampleCount = 1000 * 1000;
            var filter = new MemoryBloomFilter<string>(sampleCount, 1000 * 1000 * 1000);

            // Strings that will be inserted into the filter.
            var inserted = new List<string>();
            while (inserted.Count < sampleCount)
            {
                inserted.Add(Guid.NewGuid().ToString());
            }

            // Strings that are never inserted; used only for probing.
            var probes = new List<string>();
            while (probes.Count < sampleCount)
            {
                probes.Add(Guid.NewGuid().ToString());
            }

            foreach (var item in inserted)
            {
                filter.Add(item);
            }

            foreach (var item in inserted)
            {
                Assert.IsTrue(filter.Contains(item), item + " 必须包含在集合中");
            }

            // Hits on the probe set are only counted, not asserted on.
            var probeHits = 0;
            foreach (var probe in probes)
            {
                if (!filter.Contains(probe))
                {
                    continue;
                }

                probeHits++;
            }

            var observedRate = probeHits * 1.00 / sampleCount;

            Trace.WriteLine(String.Format("SpaceSize:{0}, DataSize:{1}, HashNumber:{2}", filter.SpaceSize, filter.DataSize, filter.NumberOfHashes));

            Trace.WriteLine(String.Format("FalsePositive Number:{0}, FalsePositiveProbability:{1}, Real FalsePositiveProbability:{2}", probeHits, filter.FalsePositiveRate, observedRate));
        }
        /// <summary>
        /// Builds a <c>MemoryBloomFilter</c> from the supplied hash functions, feeds it every
        /// line of <c>urls.txt</c> (read as UTF-8), writes each line that <c>TryAdd</c> rejects
        /// to the test output, and finally asserts that no line was rejected.
        /// </summary>
        /// <param name="hashFuncs">Hash functions handed to the bloom filter constructor.</param>
        private async Task TestHash(IEnumerable <Func <string, ulong> > hashFuncs)
        {
            var filter = new MemoryBloomFilter(hashFuncs, 5000000);
            var lines = await File.ReadAllLinesAsync("urls.txt", Encoding.UTF8);

            var anyRejected = false;

            foreach (var line in lines)
            {
                if (bloomAccepts(filter, line))
                {
                    continue;
                }

                // Report every rejected line before failing, so all of them are visible in the output.
                anyRejected = true;
                _output.WriteLine(line);
            }

            Assert.False(anyRejected);

            // Thin wrapper that names the intent of the TryAdd call.
            bool bloomAccepts(MemoryBloomFilter target, string candidate)
            {
                return target.TryAdd(candidate);
            }
        }
示例#6
0
        /// <summary>
        /// Pushes a batch of request messages into a <c>SequenceScheduler</c>, starts a
        /// <c>ConsumerBroker</c> over it, and verifies that the scheduler queue is drained
        /// within 20 seconds.
        /// </summary>
        public void ConsumeTest()
        {
            const int drainTimeoutMilliseconds = 20000;
            const int pollIntervalMilliseconds = 100;

            var site = new Site() {Domain = "www.usashopcn.com"};

            var requestMessages = NewTestRequestMessages(site);
            var bloomFilter = new MemoryBloomFilter<string>(1000 * 10, 1000 * 10 * 20);
            var lineScheduler = new SequenceScheduler(bloomFilter);
            lineScheduler.Push(requestMessages);
            Trace.WriteLine("CurrentQueueCount:" + lineScheduler.CurrentQueueCount());

            var resultPipeline = new ResultPipeline();
            resultPipeline.RegisterModule(new TestPipelineMoudle());
            var pageAnalyzers = new List<KeyValuePair<string, Type>>();
            pageAnalyzers.Add(new KeyValuePair<string, Type>(site.Domain, typeof(TestPageAnalyzer)));

            // Pass the pipeline configured above. Previously a brand-new, empty ResultPipeline
            // was handed to the broker, so the registered test module was never used.
            var consumerBroker = new ConsumerBroker(2, lineScheduler, new List<IDownloader>() {new TestDownloader()}, resultPipeline, pageAnalyzers);
            consumerBroker.Start();

            // Wait on the test thread until the queue is empty or the timeout elapses.
            // The earlier version asserted inside a one-shot timer callback that fired after
            // this method had already returned, so a failure could never be attributed to the
            // test (and the unreferenced timer could be garbage collected before firing).
            var elapsed = Stopwatch.StartNew();
            while (lineScheduler.CurrentQueueCount() != 0 && elapsed.ElapsedMilliseconds < drainTimeoutMilliseconds)
            {
                Thread.Sleep(pollIntervalMilliseconds);
            }

            // Log line kept from the timer-based version so existing trace output is unchanged.
            Trace.WriteLine("Timer结束:");
            Assert.IsTrue(lineScheduler.CurrentQueueCount() == 0, "添加消息失败");

            // NOTE(review): the scheduler is still not disposed here (it was commented out in the
            // original as well); disposing while the broker may still be running is not known to be safe.
            //lineScheduler.Dispose();
        }