/// <summary>
/// Pushes a single request message into a <c>SequenceScheduler</c> and checks that the
/// scheduler then reports exactly one queued item.
/// </summary>
public void ProductionTest()
{
    var site = new Site() { Domain = "www.usashopcn.com" };
    var requestMessage = NewTestRequestMessage(site);
    var bloomFilter = new MemoryBloomFilter<string>(1000 * 10, 1000 * 10 * 20);
    var lineScheduler = new SequenceScheduler(bloomFilter);

    try
    {
        lineScheduler.Push(requestMessage);

        // Read the count once so the traced value is the same one that is asserted on.
        var queueCount = lineScheduler.CurrentQueueCount();
        Trace.WriteLine("CurrentQueueCount:" + queueCount);

        Assert.IsTrue(queueCount == 1, "添加消息失败");
    }
    finally
    {
        // Dispose even when the assertion above throws, so a failing test does not leak the scheduler.
        lineScheduler.Dispose();
    }
}
// Holds the status timer so it stays reachable after RunSpider returns. A System.Threading.Timer
// that is referenced only by a local variable may be garbage collected, which stops its callbacks.
private static Timer _runStatusTimer;

/// <summary>
/// Builds a <c>Spider</c> on top of a <c>SequenceScheduler</c>, registers the downloader, the page
/// analyzer and the console result module, starts crawling "http://www.usashopcn.com/", and prints
/// the spider's run status to the console every 2000 ms (first print is immediate).
/// </summary>
static void RunSpider()
{
    var bloomFilter = new MemoryBloomFilter<string>(1000 * 10, 1000 * 10 * 20);
    _spider = new Spider(new SequenceScheduler(bloomFilter));

    var downloaders = new List<IDownloader> { new HttpClientDownloader(4) };
    _spider.RegisterDownloader(downloaders);
    _spider.RegisterPageAnalyzer<UsashopcnPageAnalyzer>(UsashopcnPageAnalyzer.SiteId);
    _spider.RegisterResultPipeModule(new ConsoleModule(0, 20, 400, 500, true, true));

    _spider.Start(TopicType.StaticHtml, SiteIndex.Usashopcn, "http://www.usashopcn.com/");

    // If RunSpider is invoked again, stop the previous reporter instead of leaving it running
    // against the spider instance that was just replaced.
    if (_runStatusTimer != null)
    {
        _runStatusTimer.Dispose();
    }

    // The spider is handed to the callback through the timer's state argument.
    _runStatusTimer = new Timer(
        spider => { Console.WriteLine(((Spider) spider).RunStatusInfo()); },
        _spider,
        0,
        2000);
}
// Holds the status timer so it stays reachable after RunSpider returns. A System.Threading.Timer
// that is referenced only by a local variable may be garbage collected, which stops its callbacks.
private Timer _runStatusTimer;

/// <summary>
/// Builds a <c>Spider</c> on top of a <c>SequenceScheduler</c>, registers the downloader, the page
/// analyzer and the console result module, starts crawling the URLs returned by <c>GetUrls()</c>,
/// and prints queue/task/consume/result counters to the console every 2000 ms after an initial
/// 2000 ms delay.
/// </summary>
public void RunSpider()
{
    var bloomFilter = new MemoryBloomFilter<string>(1000 * 10, 1000 * 10 * 20);
    _spider = new Spider(new SequenceScheduler(bloomFilter));

    // Alternative downloader kept from the original (disabled):
    //var downloaders = new List<IDownloader> { new FakeDownloader(4) };
    var downloaders = new List<IDownloader> { new HttpClientDownloader(4) };
    _spider.RegisterDownloader(downloaders);
    _spider.RegisterPageAnalyzer<UsashopcnPageAnalyzer>(UsashopcnPageAnalyzer.SiteId);
    _spider.RegisterResultPipeModule(new ConsoleModule(500, 0, 400, 500, true, true));

    _spider.Start(TopicType.StaticHtml, SiteIndex.Usashopcn, GetUrls());

    // If RunSpider is invoked again on this instance, stop the previous reporter instead of
    // leaving it running against the spider instance that was just replaced.
    if (_runStatusTimer != null)
    {
        _runStatusTimer.Dispose();
    }

    // The spider is handed to the callback through the timer's state argument.
    _runStatusTimer = new Timer(
        spider =>
        {
            var status = ((Spider) spider).RunStatusInfo();
            Console.WriteLine(String.Format(
                "QueueCount:{0}, TaskCount={1}, ConsumeTotal:{2}, ResultTotal:{3}",
                status.QueueCount,
                status.TaskCount,
                status.ConsumeTotal,
                status.ResultTotal));
        },
        _spider,
        2000,
        2000);
}
/// <summary>
/// Adds one million random GUID strings to a <c>MemoryBloomFilter</c> and asserts that each of
/// them is reported as contained. A second, independently generated batch of GUID strings is then
/// probed; the number of hits in that batch is counted and traced together with the statistics
/// the filter exposes about itself.
/// </summary>
public void TestMethod1()
{
    int sampleCount = 1000 * 1000;
    var filter = new MemoryBloomFilter<string>(sampleCount, 1000 * 1000 * 1000);

    // Batch that is inserted into the filter.
    var inserted = new List<string>();
    while (inserted.Count < sampleCount)
    {
        inserted.Add(Guid.NewGuid().ToString());
    }

    // Batch that is only probed, never inserted.
    var probes = new List<string>();
    while (probes.Count < sampleCount)
    {
        probes.Add(Guid.NewGuid().ToString());
    }

    foreach (var item in inserted)
    {
        filter.Add(item);
    }

    foreach (var item in inserted)
    {
        Assert.IsTrue(filter.Contains(item), item + " 必须包含在集合中");
    }

    // Hits on the probe batch are only counted; no per-item assertion is applied to them.
    var hitCount = 0;
    foreach (var probe in probes)
    {
        if (filter.Contains(probe))
        {
            hitCount++;
        }
    }

    var sizeSummary = string.Format(
        "SpaceSize:{0}, DataSize:{1}, HashNumber:{2}",
        filter.SpaceSize,
        filter.DataSize,
        filter.NumberOfHashes);
    Trace.WriteLine(sizeSummary);

    var rateSummary = string.Format(
        "FalsePositive Number:{0}, FalsePositiveProbability:{1}, Real FalsePositiveProbability:{2}",
        hitCount,
        filter.FalsePositiveRate,
        hitCount * 1.00 / sampleCount);
    Trace.WriteLine(rateSummary);
}
/// <summary>
/// Creates a <c>MemoryBloomFilter</c> from the supplied hash functions, feeds it every line of
/// "urls.txt" (read as UTF-8), writes each line for which <c>TryAdd</c> returns false to the test
/// output, and fails if any such line was seen.
/// </summary>
/// <param name="hashFuncs">Hash functions handed to the filter's constructor.</param>
private async Task TestHash(IEnumerable<Func<string, ulong>> hashFuncs)
{
    var filter = new MemoryBloomFilter(hashFuncs, 5000000);
    var lines = await File.ReadAllLinesAsync("urls.txt", Encoding.UTF8);

    var sawRejectedUrl = false;
    for (var index = 0; index < lines.Length; index++)
    {
        var candidate = lines[index];
        if (filter.TryAdd(candidate))
        {
            continue;
        }

        // Keep going after a rejection so every offending URL is listed in the output.
        sawRejectedUrl = true;
        _output.WriteLine(candidate);
    }

    Assert.False(sawRejectedUrl);
}
/// <summary>
/// Pushes a batch of request messages into a <c>SequenceScheduler</c>, starts a
/// <c>ConsumerBroker</c> over it, and checks that the scheduler's queue is empty within 20 seconds.
/// </summary>
public void ConsumeTest()
{
    var site = new Site() {Domain = "www.usashopcn.com"};
    var requestMessages = NewTestRequestMessages(site);
    var bloomFilter = new MemoryBloomFilter<string>(1000 * 10, 1000 * 10 * 20);
    var lineScheduler = new SequenceScheduler(bloomFilter);
    lineScheduler.Push(requestMessages);
    Trace.WriteLine("CurrentQueueCount:" + lineScheduler.CurrentQueueCount());

    var resultPipeline = new ResultPipeline();
    resultPipeline.RegisterModule(new TestPipelineMoudle());

    var pageAnalyzers = new List<KeyValuePair<string, Type>>();
    pageAnalyzers.Add(new KeyValuePair<string, Type>(site.Domain, typeof(TestPageAnalyzer)));

    // Pass the pipeline configured above; a fresh ResultPipeline here would drop the
    // registered test module.
    var consumerBroker = new ConsumerBroker(
        2,
        lineScheduler,
        new List<IDownloader>() {new TestDownloader()},
        resultPipeline,
        pageAnalyzers);
    consumerBroker.Start();

    // Wait on the test thread (up to 20 s) so the assertion below actually decides the test
    // outcome. An assertion raised from a timer callback after this method has returned cannot
    // fail the test.
    System.Threading.SpinWait.SpinUntil(() => lineScheduler.CurrentQueueCount() == 0, 20000);

    Trace.WriteLine("Timer结束:");
    Assert.IsTrue(lineScheduler.CurrentQueueCount() == 0, "添加消息失败");

    // NOTE(review): the scheduler is not disposed here; the original had the Dispose call
    // commented out — presumably because the broker may still be using it. Confirm.
    //lineScheduler.Dispose();
}