/// <summary>
/// Enqueues four single-request batches (a.com, b.com, a.com, a.com) for one owner,
/// then asserts that the first dequeued request is a.com and that the scheduler
/// reports a total of 2.
/// </summary>
public async Task EnqueueAndDequeueQueueBfs()
{
    var ownerId = Guid.NewGuid().ToString("N");
    var scheduler = new QueueDistinctBfsScheduler(new HashSetDuplicateRemover(), _hashAlgorithm);

    // Same URLs, in the same order, each submitted as its own batch.
    var urls = new[] { "http://www.a.com", "http://www.b.com", "http://www.a.com", "http://www.a.com" };
    foreach (var url in urls)
    {
        var batch = new[] { new Request(url) { Owner = ownerId } };
        await scheduler.EnqueueAsync(batch);
    }

    var dequeued = await scheduler.DequeueAsync();
    var first = dequeued.First();

    Assert.Equal("http://www.a.com/", first.RequestUri.ToString());
    Assert.Equal(2, scheduler.Total);
}
/// <summary>
/// Runs a spider with a single request that uses <c>DownloaderType.Exception</c> and
/// <c>RetryTimes = 5</c>, then checks the recorded statistics: one request in total,
/// one failed, none succeeded, and six failed downloads on the first download
/// statistics entry.
/// </summary>
/// <returns>A task that completes when the spider has run and all assertions passed.</returns>
public async Task RetryDownloadTimes()
{
    var spider = LocalSpiderProvider.Value.Create<Spider>();
    spider.NewGuidId();
    spider.Name = "RetryDownloadTimes";
    spider.EmptySleepTime = 15;
    var scheduler = new QueueDistinctBfsScheduler();
    spider.Scheduler = scheduler;
    spider.AddRequests(new Request("http://www.RetryDownloadTimes.com")
    {
        DownloaderType = DownloaderType.Exception,
        RetryTimes = 5
    });

    await spider.RunAsync();

    var statisticsStore = LocalSpiderProvider.Value.GetRequiredService<IStatisticsStore>();

    // Await instead of blocking on .Result: the method is already async, and awaiting
    // surfaces the original exception rather than an AggregateException.
    var s = await statisticsStore.GetSpiderStatisticsAsync(spider.Id);
    Assert.Equal(1, s.Total);
    Assert.Equal(1, s.Failed);
    Assert.Equal(0, s.Success);

    var dss = await statisticsStore.GetDownloadStatisticsListAsync(1, 10);
    var ds = dss[0];
    // 6 is presumably the initial attempt plus the 5 configured retries.
    Assert.Equal(6, ds.Failed);
    Assert.Equal(0, ds.Success);
}
/// <summary>
/// Runs a spider configured with <c>RetryDownloadTimes = 5</c> and the
/// <c>DownloaderType.Exception</c> downloader against a single URL, then checks the
/// recorded spider statistics and the first download statistics entry.
/// </summary>
public void RetryDownloadTimes()
{
    var spider = SpiderFactory.Create<Spider>();
    spider.NewGuidId();
    spider.Name = "RetryDownloadTimes";
    spider.RetryDownloadTimes = 5;
    spider.EmptySleepTime = 15;
    spider.DownloaderSettings.Type = DownloaderType.Exception;

    var bfsScheduler = new QueueDistinctBfsScheduler();
    spider.Scheduler = bfsScheduler;
    spider.AddRequests("http://www.RetryDownloadTimes.com");

    spider.Run();

    var store = SpiderFactory.GetRequiredService<IStatisticsStore>();

    // Spider-level counters: one request overall, and it failed.
    var spiderStatisticsTask = store.GetSpiderStatisticsAsync(spider.Id);
    var spiderStatistics = spiderStatisticsTask.Result;
    Assert.Equal(1, spiderStatistics.Total);
    Assert.Equal(1, spiderStatistics.Failed);
    Assert.Equal(0, spiderStatistics.Success);

    // Download-level counters, read from the first entry of the first page of 10.
    var downloadStatisticsTask = store.GetDownloadStatisticsListAsync(1, 10);
    var downloadStatisticsList = downloadStatisticsTask.Result;
    var firstDownloadStatistics = downloadStatisticsList[0];
    Assert.Equal(6, firstDownloadStatistics.Failed);
    Assert.Equal(0, firstDownloadStatistics.Success);
}
/// <summary>
/// Configures the spider from <c>_definition</c>: assigns a new id and a
/// <c>QueueDistinctBfsScheduler</c>, applies the optional speed / depth / page-limit
/// overrides when they are positive, selects the HttpClient downloader with proxy
/// enabled, registers the data flows and seeds the start URLs.
/// </summary>
protected override void Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();

    // Each override is applied only when the definition supplies a positive value.
    var concurrentRequests = _definition.NumberOfConcurrentRequests;
    if (concurrentRequests > 0)
    {
        Speed = concurrentRequests;
    }

    var configuredDepth = _definition.Deepth.GetValueOrDefault();
    if (configuredDepth > 0)
    {
        Depth = configuredDepth;
    }

    var configuredPageLimit = _definition.PageLimit.GetValueOrDefault();
    if (configuredPageLimit > 0)
    {
        PageLimit = configuredPageLimit;
    }

    // Downloader settings.
    DownloaderSettings.Type = DownloaderType.HttpClient;
    DownloaderSettings.UseProxy = true;

    var itemParser = new SimpleItemDataParser(_definition.ItemUrlsSelector, _definition.Mapping);
    var flow = AddDataFlow(itemParser);
    flow.AddDataFlow(new HtmlFileStorage(_definition));

    // The pagination parser is registered only when a next-page selector is configured.
    if (!string.IsNullOrWhiteSpace(_definition.NextPageSelector))
    {
        AddDataFlow(new SimplePaginationDataParser(_definition.NextPageSelector));
    }

    // Start URLs are stored as one ';'-separated string; empty entries are skipped.
    var startUrls = _definition.Urls.Split(';', StringSplitOptions.RemoveEmptyEntries);
    AddRequests(startUrls);
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth
/// to 3, registers the cnblogs parser followed by JSON file storage, and seeds the
/// cnblogs news front page.
/// </summary>
protected override void Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;

    // Parser first, storage second: data flows are registered in this order.
    var flow = AddDataFlow(new CnblogsDataParser());
    flow.AddDataFlow(new JsonFileStorage());

    AddRequests("https://news.cnblogs.com/");
}
/// <summary>
/// Assigns a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth to 3, selects
/// the HttpClient downloader, registers the cnblogs parser followed by JSON file
/// storage, and seeds the cnblogs home page.
/// </summary>
protected override void Initialize()
{
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;
    DownloaderSettings.Type = DownloaderType.HttpClient;

    // Parser first, storage second: data flows are registered in this order.
    var flow = AddDataFlow(new CnblogsDataParser());
    flow.AddDataFlow(new JsonFileStorage());

    AddRequests("http://www.cnblogs.com/");
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth
/// to 3, selects the HttpClient downloader, registers the quotes parser followed by
/// console storage, and seeds quotes.toscrape.com.
/// </summary>
protected override void Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;
    DownloaderSettings.Type = DownloaderType.HttpClient;

    // Parser first, storage second: data flows are registered in this order.
    var flow = AddDataFlow(new QuotesDataParser());
    flow.AddDataFlow(new ConsoleStorage());

    AddRequests("http://quotes.toscrape.com/");
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 5, Depth to 2
/// and PageLimit to 2, selects the HttpClient downloader, registers the item-links
/// parser, HTML file storage and pagination parser (in that order), and seeds the
/// vnexpress "kinh-doanh" section.
/// </summary>
protected override void Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 5;
    Depth = 2;
    PageLimit = 2;
    DownloaderSettings.Type = DownloaderType.HttpClient;

    // Registration order: item links -> HTML storage -> pagination.
    var afterItemLinks = AddDataFlow(new VnexpressItemLinksDataParser());
    var afterStorage = afterItemLinks.AddDataFlow(new HtmlFileStorage());
    afterStorage.AddDataFlow(new VnexpressPaginationDataParser());

    var startUrls = new[] { "https://vnexpress.net/kinh-doanh" };
    AddRequests(startUrls);
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth
/// to 3, selects the HttpClient downloader, registers the <c>CnblogsEntry</c> parser
/// followed by SQL Server storage in insert-ignore-duplicate mode, and seeds cnblogs
/// news pages 1 and 2.
/// </summary>
protected override void Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;
    DownloaderSettings.Type = DownloaderType.HttpClient;

    var flow = AddDataFlow(new DataParser<CnblogsEntry>());
    // NOTE(review): the connection string, including credentials, is embedded in
    // source; consider loading it from configuration.
    flow.AddDataFlow(new SqlServerEntityStorage(
        StorageType.InsertIgnoreDuplicate,
        "Data Source=.;Initial Catalog=master;User Id=sa;Password='******'"));

    // Each request gets its own properties dictionary.
    var firstPage = new Request(
        "https://news.cnblogs.com/n/page/1/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    var secondPage = new Request(
        "https://news.cnblogs.com/n/page/2/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    AddRequests(firstPage, secondPage);
}
/// <summary>
/// Enqueues from 1000 parallel iterations (up to 20 at a time): a.com three times,
/// b.com once and one URL derived from the iteration index. Then dequeues 1000 times
/// in parallel and asserts that 2 requests remain queued for the owner and that the
/// scheduler total is 1002.
/// </summary>
public void ParallelEnqueueAndDequeueQueueBfs()
{
    var scheduler = new QueueDistinctBfsScheduler();
    var ownerId = Guid.NewGuid().ToString("N");

    // URLs that every iteration submits, in submission order.
    var repeatedUrls = new[]
    {
        "http://www.a.com",
        "http://www.a.com",
        "http://www.a.com",
        "http://www.b.com"
    };

    Parallel.For(0, 1000, new ParallelOptions { MaxDegreeOfParallelism = 20 }, index =>
    {
        foreach (var url in repeatedUrls)
        {
            scheduler.Enqueue(new[] { new Request(url) { OwnerId = ownerId } });
        }

        // One URL per iteration index.
        var indexedUrl = $"http://www.{index.ToString()}.com";
        scheduler.Enqueue(new[] { new Request(indexedUrl, null) { OwnerId = ownerId } });
    });

    Parallel.For(0, 1000, new ParallelOptions { MaxDegreeOfParallelism = 20 }, iteration =>
    {
        scheduler.Dequeue(ownerId);
    });

    var remaining = scheduler.Requests[ownerId];
    Assert.Equal(2, remaining.Count);
    Assert.Equal(1002, scheduler.Total);
}
/// <summary>
/// Enqueues from 1000 parallel iterations (up to 20 at a time): a.com three times,
/// b.com once and one URL derived from the iteration index. Then dequeues 1000 times
/// in parallel and asserts that the scheduler total is 1002.
/// </summary>
/// <remarks>
/// <c>Parallel.For</c> only accepts synchronous delegates. Passing an <c>async</c>
/// lambda compiles to <c>async void</c>, so the loop returns as soon as each body hits
/// its first incomplete await: the final assertion could then run before the enqueues
/// finished, and exceptions would be unobservable. Each iteration therefore blocks on
/// its own task, which keeps the 20-way parallelism and the <c>void</c> signature
/// while guaranteeing all work completes before the assertion.
/// </remarks>
public void ParallelEnqueueAndDequeueQueueBfs()
{
    var scheduler = new QueueDistinctBfsScheduler(new HashSetDuplicateRemover());
    var ownerId = Guid.NewGuid().ToString("N");

    // The async work of one enqueue iteration, wrapped so it yields an awaitable Task.
    Func<int, Task> enqueueBatchAsync = async i =>
    {
        await scheduler.EnqueueAsync(new[] { new Request("http://www.a.com") { Owner = ownerId } });
        await scheduler.EnqueueAsync(new[] { new Request("http://www.a.com") { Owner = ownerId } });
        await scheduler.EnqueueAsync(new[] { new Request("http://www.a.com") { Owner = ownerId } });
        await scheduler.EnqueueAsync(new[] { new Request("http://www.b.com") { Owner = ownerId } });
        await scheduler.EnqueueAsync(new[] { new Request($"http://www.{i.ToString()}.com", null) { Owner = ownerId } });
    };

    // The async work of one dequeue iteration.
    Func<Task> dequeueOnceAsync = async () =>
    {
        await scheduler.DequeueAsync();
    };

    Parallel.For(0, 1000, new ParallelOptions { MaxDegreeOfParallelism = 20 }, i =>
    {
        // Block until this iteration's enqueues are done; GetResult rethrows the
        // original exception instead of wrapping it in an AggregateException.
        enqueueBatchAsync(i).GetAwaiter().GetResult();
    });

    Parallel.For(0, 1000, new ParallelOptions { MaxDegreeOfParallelism = 20 }, i =>
    {
        dequeueOnceAsync().GetAwaiter().GetResult();
    });

    Assert.Equal(1002, scheduler.Total);
}
/// <summary>
/// Assigns a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1, registers the
/// <c>CnblogsEntry</c> parser followed by the default storage, and seeds cnblogs news
/// pages 1 through 9, one <c>AddRequests</c> call per page.
/// </summary>
protected override void Initialize()
{
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;

    var flow = AddDataFlow(new DataParser<CnblogsEntry>());
    flow.AddDataFlow(GetDefaultStorage());

    for (var page = 1; page <= 9; page++)
    {
        // A fresh properties dictionary is created for every request.
        var properties = new Dictionary<string, string> { { "网站", "博客园" } };
        var pageRequest = new Request($"https://news.cnblogs.com/n/page/{page}/", properties);
        AddRequests(pageRequest);
    }
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth
/// to 3, registers <c>DatabaseSpiderDataParser</c> followed by the default storage,
/// and seeds cnblogs news pages 1 and 2.
/// </summary>
protected override void Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;

    var flow = AddDataFlow(new DatabaseSpiderDataParser());
    flow.AddDataFlow(GetDefaultStorage());

    // Each request gets its own properties dictionary.
    var firstPage = new Request(
        "https://news.cnblogs.com/n/page/1/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    var secondPage = new Request(
        "https://news.cnblogs.com/n/page/2/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    AddRequests(firstPage, secondPage);
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth
/// to 3, registers the <c>CnblogsEntry</c> parser followed by MySQL storage in
/// insert-and-update mode, and seeds the datachart.500.com "ssq" page.
/// </summary>
protected override void Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;

    var flow = AddDataFlow(new DataParser<CnblogsEntry>());
    // NOTE(review): host and credentials are hard-coded in this connection string;
    // consider loading them from configuration instead of source.
    flow.AddDataFlow(new MySqlEntityStorage(
        StorageType.InsertAndUpdate,
        "Database='mysql';Data Source=192.168.11.128;password=Yang123456.;User ID=root;Port=3306;"));

    var properties = new Dictionary<string, string> { { "彩票", "双色球" } };
    var seed = new Request("http://datachart.500.com/ssq/", properties);
    AddRequests(seed);
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth
/// to 3, selects the HttpClient downloader, registers the <c>BaiduSearchEntry</c>
/// parser followed by the default storage, and seeds cnblogs news pages 1 and 2.
/// </summary>
protected override void Initialize()
{
    NewId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;
    DownloaderOptions.Type = DownloaderType.HttpClient;

    var flow = AddDataFlow(new DataParser<BaiduSearchEntry>());
    flow.AddDataFlow(GetDefaultStorage());

    // Each request gets its own properties dictionary.
    var firstPage = new Request(
        "https://news.cnblogs.com/n/page/1/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    var secondPage = new Request(
        "https://news.cnblogs.com/n/page/2/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    AddRequests(firstPage, secondPage);
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth
/// to 3, registers the <c>CnblogsEntry</c> parser followed by MySQL storage in
/// insert-and-update mode, and seeds cnblogs news pages 1 and 2.
/// </summary>
protected override void Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;

    var flow = AddDataFlow(new DataParser<CnblogsEntry>());
    // NOTE(review): host and credentials are hard-coded in this connection string;
    // consider loading them from configuration instead of source.
    flow.AddDataFlow(new MySqlEntityStorage(
        StorageType.InsertAndUpdate,
        "Database='mysql';Data Source=zousong.com;password=1qazZAQ!;User ID=root;Port=3306;"));

    // Each request gets its own properties dictionary.
    var firstPage = new Request(
        "https://news.cnblogs.com/n/page/1/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    var secondPage = new Request(
        "https://news.cnblogs.com/n/page/2/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    AddRequests(firstPage, secondPage);
}
/// <summary>
/// Enqueues four single-request batches (a.com, b.com, a.com, a.com) for one owner,
/// dequeues once, and asserts that the dequeued request is a.com, that exactly one
/// request remains queued for the owner, and that the scheduler total is 2.
/// </summary>
public void EnqueueAndDequeueQueueBfs()
{
    var ownerId = Guid.NewGuid().ToString("N");
    var scheduler = new QueueDistinctBfsScheduler();

    // Same URLs, in the same order, each submitted as its own batch.
    var urls = new[] { "http://www.a.com", "http://www.b.com", "http://www.a.com", "http://www.a.com" };
    foreach (var url in urls)
    {
        var batch = new[] { new Request(url) { OwnerId = ownerId } };
        scheduler.Enqueue(batch);
    }

    var dequeued = scheduler.Dequeue(ownerId);
    var first = dequeued[0];

    Assert.Equal("http://www.a.com", first.Url);
    Assert.Single(scheduler.Requests[ownerId]);
    Assert.Equal(2, scheduler.Total);
}
/// <summary>
/// Assigns a new id and a <c>QueueDistinctBfsScheduler</c>, sets Speed to 1 and Depth
/// to 3, registers the <c>CnblogsEntry</c> parser followed by the default storage, and
/// asynchronously seeds cnblogs news pages 1 and 2 with <c>UseProxy</c> enabled on
/// both requests.
/// </summary>
/// <returns>A task that completes once the seed requests have been added.</returns>
protected override async Task Initialize()
{
    NewGuidId();
    Scheduler = new QueueDistinctBfsScheduler();
    Speed = 1;
    Depth = 3;

    var flow = AddDataFlow(new DataParser<CnblogsEntry>());
    flow.AddDataFlow(GetDefaultStorage());

    // Each request gets its own properties dictionary.
    var firstPage = new Request(
        "https://news.cnblogs.com/n/page/1/",
        new Dictionary<string, string> { { "网站", "博客园" } })
    {
        UseProxy = true
    };
    var secondPage = new Request(
        "https://news.cnblogs.com/n/page/2/",
        new Dictionary<string, string> { { "网站", "博客园" } })
    {
        UseProxy = true
    };

    await AddRequests(firstPage, secondPage);
}