Пример #1
0
        /// <summary>
        /// Enqueues four single-request batches (three of them for the same URL), then
        /// checks the first dequeued request and the scheduler's <c>Total</c>.
        /// </summary>
        public async Task EnqueueAndDequeueQueueBfs()
        {
            var ownerId   = Guid.NewGuid().ToString("N");
            var scheduler = new QueueDistinctBfsScheduler(new HashSetDuplicateRemover(), _hashAlgorithm);

            // Submission order matters for the first assertion: a, b, a, a — one batch per URL.
            var urlsInOrder = new[]
            {
                "http://www.a.com",
                "http://www.b.com",
                "http://www.a.com",
                "http://www.a.com"
            };

            foreach (var url in urlsInOrder)
            {
                var batch = new[] { new Request(url) { Owner = ownerId } };
                await scheduler.EnqueueAsync(batch);
            }

            var dequeued = await scheduler.DequeueAsync();
            var first    = dequeued.First();

            Assert.Equal("http://www.a.com/", first.RequestUri.ToString());
            Assert.Equal(2, scheduler.Total);
        }
Пример #2
0
        /// <summary>
        /// Runs a spider whose single request uses <c>DownloaderType.Exception</c> with
        /// <c>RetryTimes = 5</c>, then asserts the spider-level and download-level statistics.
        /// </summary>
        /// <returns>A task that completes when the spider has run and all assertions have been evaluated.</returns>
        public async Task RetryDownloadTimes()
        {
            var spider = LocalSpiderProvider.Value.Create <Spider>();

            spider.NewGuidId();
            spider.Name           = "RetryDownloadTimes";
            spider.EmptySleepTime = 15;
            var scheduler = new QueueDistinctBfsScheduler();

            spider.Scheduler = scheduler;
            spider.AddRequests(new Request("http://www.RetryDownloadTimes.com")
            {
                DownloaderType = DownloaderType.Exception, RetryTimes = 5
            });
            await spider.RunAsync();

            var statisticsStore = LocalSpiderProvider.Value.GetRequiredService <IStatisticsStore>();

            // Awaited instead of read through .Result: this method is already async, and
            // blocking would tie up the thread and wrap any failure in an AggregateException.
            var s = await statisticsStore.GetSpiderStatisticsAsync(spider.Id);

            Assert.Equal(1, s.Total);
            Assert.Equal(1, s.Failed);
            Assert.Equal(0, s.Success);

            var dss = await statisticsStore.GetDownloadStatisticsListAsync(1, 10);
            var ds  = dss[0];

            // NOTE(review): 6 is presumably the initial attempt plus the 5 retries — confirm.
            Assert.Equal(6, ds.Failed);
            Assert.Equal(0, ds.Success);
        }
Пример #3
0
        /// <summary>
        /// Runs a spider configured with the <c>Exception</c> downloader and
        /// <c>RetryDownloadTimes = 5</c> against a single URL, then asserts the
        /// spider-level and download-level statistics.
        /// </summary>
        public void RetryDownloadTimes()
        {
            var spider = SpiderFactory.Create<Spider>();

            spider.NewGuidId();
            spider.Name                    = "RetryDownloadTimes";
            spider.RetryDownloadTimes      = 5;
            spider.EmptySleepTime          = 15;
            spider.DownloaderSettings.Type = DownloaderType.Exception;
            spider.Scheduler               = new QueueDistinctBfsScheduler();

            spider.AddRequests("http://www.RetryDownloadTimes.com");
            spider.Run();

            var store       = SpiderFactory.GetRequiredService<IStatisticsStore>();
            var spiderStats = store.GetSpiderStatisticsAsync(spider.Id).Result;

            Assert.Equal(1, spiderStats.Total);
            Assert.Equal(1, spiderStats.Failed);
            Assert.Equal(0, spiderStats.Success);

            // Only the first entry of the first page (page 1, size 10) is inspected.
            var downloadStatsPage = store.GetDownloadStatisticsListAsync(1, 10).Result;
            var downloadStats     = downloadStatsPage[0];

            Assert.Equal(6, downloadStats.Failed);
            Assert.Equal(0, downloadStats.Success);
        }
Пример #4
0
 /// <summary>
 /// Configures the spider from <c>_definition</c>: optional speed, depth and page limit,
 /// an HttpClient downloader with proxy enabled, the item/HTML-storage data flows,
 /// an optional pagination parser, and the seed URLs.
 /// </summary>
 protected override void Initialize()
 {
     NewGuidId();
     Scheduler = new QueueDistinctBfsScheduler();

     // Each limit is applied only when the definition carries a positive value.
     var concurrentRequests = _definition.NumberOfConcurrentRequests;
     if (concurrentRequests > 0)
     {
         Speed = concurrentRequests;
     }

     var maxDepth = _definition.Deepth.GetValueOrDefault();
     if (maxDepth > 0)
     {
         Depth = maxDepth;
     }

     var maxPages = _definition.PageLimit.GetValueOrDefault();
     if (maxPages > 0)
     {
         PageLimit = maxPages;
     }

     DownloaderSettings.Type     = DownloaderType.HttpClient;
     DownloaderSettings.UseProxy = true;

     var itemParser = new SimpleItemDataParser(_definition.ItemUrlsSelector, _definition.Mapping);
     AddDataFlow(itemParser)
     .AddDataFlow(new HtmlFileStorage(_definition));

     // The pagination parser is only added when a next-page selector is configured.
     var nextPageSelector = _definition.NextPageSelector;
     if (string.IsNullOrWhiteSpace(nextPageSelector) == false)
     {
         AddDataFlow(new SimplePaginationDataParser(nextPageSelector));
     }

     // Seed URLs are stored as one ';'-separated string; empty segments are skipped.
     var seedUrls = _definition.Urls.Split(';', StringSplitOptions.RemoveEmptyEntries);
     AddRequests(seedUrls);
 }
Пример #5
0
 /// <summary>
 /// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 1 and depth 3,
 /// wires the Cnblogs parser to JSON file storage, and seeds the news start page.
 /// </summary>
 protected override void Initialize()
 {
     const string startUrl = "https://news.cnblogs.com/";

     NewGuidId();
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;
     Depth     = 3;

     var parser = new CnblogsDataParser();
     AddDataFlow(parser)
     .AddDataFlow(new JsonFileStorage());

     AddRequests(startUrl);
 }
 /// <summary>
 /// Sets up a <c>QueueDistinctBfsScheduler</c>, speed 1, depth 3 and the HttpClient
 /// downloader, wires the Cnblogs parser to JSON file storage, and seeds the home page.
 /// </summary>
 protected override void Initialize()
 {
     const string startUrl = "http://www.cnblogs.com/";

     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;
     Depth     = 3;
     DownloaderSettings.Type = DownloaderType.HttpClient;

     var parser = new CnblogsDataParser();
     AddDataFlow(parser)
     .AddDataFlow(new JsonFileStorage());

     AddRequests(startUrl);
 }
Пример #7
0
 /// <summary>
 /// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 1, depth 3 and the
 /// HttpClient downloader, wires the quotes parser to console storage, and seeds the start URL.
 /// </summary>
 protected override void Initialize()
 {
     const string startUrl = "http://quotes.toscrape.com/";

     NewGuidId();
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;
     Depth     = 3;
     DownloaderSettings.Type = DownloaderType.HttpClient;

     var parser = new QuotesDataParser();
     AddDataFlow(parser)
     .AddDataFlow(new ConsoleStorage());

     AddRequests(startUrl);
 }
Пример #8
0
 /// <summary>
 /// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 5, depth 2, page limit 2 and
 /// the HttpClient downloader, then chains the item-links parser, HTML file storage and
 /// pagination parser, and seeds a single section URL.
 /// </summary>
 protected override void Initialize()
 {
     NewGuidId();
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 5;
     Depth     = 2;
     PageLimit = 2;
     DownloaderSettings.Type = DownloaderType.HttpClient;

     // Flows are registered in this order: item links -> HTML storage -> pagination.
     var itemLinksParser = new VnexpressItemLinksDataParser();
     AddDataFlow(itemLinksParser)
     .AddDataFlow(new HtmlFileStorage())
     .AddDataFlow(new VnexpressPaginationDataParser());

     var seedUrls = new string[] { "https://vnexpress.net/kinh-doanh" };
     AddRequests(seedUrls);
 }
Пример #9
0
		/// <summary>
		/// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 1, depth 3 and the
		/// HttpClient downloader, wires the <c>CnblogsEntry</c> parser to SQL Server storage
		/// (insert, ignoring duplicates), and seeds the first two news pages.
		/// </summary>
		protected override void Initialize()
		{
			const string connectionString = "Data Source=.;Initial Catalog=master;User Id=sa;Password='******'";

			NewGuidId();
			Scheduler = new QueueDistinctBfsScheduler();
			Speed = 1;
			Depth = 3;
			DownloaderSettings.Type = DownloaderType.HttpClient;

			var parser = new DataParser<CnblogsEntry>();
			AddDataFlow(parser)
				.AddDataFlow(new SqlServerEntityStorage(StorageType.InsertIgnoreDuplicate, connectionString));

			// Each request gets its own dictionary instance, as in the original call.
			var firstPage = new Request(
				"https://news.cnblogs.com/n/page/1/",
				new Dictionary<string, string> {{"网站", "博客园"}});
			var secondPage = new Request(
				"https://news.cnblogs.com/n/page/2/",
				new Dictionary<string, string> {{"网站", "博客园"}});

			AddRequests(firstPage, secondPage);
		}
Пример #10
0
        /// <summary>
        /// Enqueues from 1000 parallel iterations (three "a" requests, one "b" request and one
        /// per-iteration URL each), dequeues 1000 times in parallel, then asserts the remaining
        /// count for the owner and the scheduler's <c>Total</c>.
        /// </summary>
        public void ParallelEnqueueAndDequeueQueueBfs()
        {
            var scheduler = new QueueDistinctBfsScheduler();
            var ownerId   = Guid.NewGuid().ToString("N");

            // URLs that every iteration submits, in this order, before its own unique URL.
            var sharedUrls = new[]
            {
                "http://www.a.com",
                "http://www.a.com",
                "http://www.a.com",
                "http://www.b.com"
            };

            var enqueueOptions = new ParallelOptions { MaxDegreeOfParallelism = 20 };
            Parallel.For(0, 1000, enqueueOptions, i =>
            {
                foreach (var url in sharedUrls)
                {
                    scheduler.Enqueue(new[] { new Request(url) { OwnerId = ownerId } });
                }

                var uniqueRequest = new Request($"http://www.{i.ToString()}.com", null)
                {
                    OwnerId = ownerId
                };
                scheduler.Enqueue(new[] { uniqueRequest });
            });

            var dequeueOptions = new ParallelOptions { MaxDegreeOfParallelism = 20 };
            Parallel.For(0, 1000, dequeueOptions, i =>
            {
                scheduler.Dequeue(ownerId);
            });

            Assert.Equal(2, scheduler.Requests[ownerId].Count);
            Assert.Equal(1002, scheduler.Total);
        }
Пример #11
0
        /// <summary>
        /// Enqueues from 1000 parallel iterations (three "a" requests, one "b" request and one
        /// per-iteration URL each), dequeues 1000 times in parallel, then asserts the
        /// scheduler's <c>Total</c>.
        /// </summary>
        /// <remarks>
        /// The delegates passed to <c>Parallel.For</c> are deliberately synchronous.
        /// <c>Parallel.For</c> has no Task-returning overload, so an <c>async</c> lambda is
        /// compiled as <c>async void</c>: the loop would return as soon as each iteration hit
        /// its first incomplete await, letting the dequeue phase and the final assertion race
        /// with still-running enqueues, and any exception would be unobservable. Blocking on
        /// each call inside the worker keeps this method's <c>void</c> signature while making
        /// the loop wait for the work it started.
        /// </remarks>
        public void ParallelEnqueueAndDequeueQueueBfs()
        {
            var scheduler = new QueueDistinctBfsScheduler(new HashSetDuplicateRemover());
            var ownerId   = Guid.NewGuid().ToString("N");

            // URLs that every iteration submits, in this order, before its own unique URL.
            var sharedUrls = new[]
            {
                "http://www.a.com",
                "http://www.a.com",
                "http://www.a.com",
                "http://www.b.com"
            };

            Parallel.For(0, 1000, new ParallelOptions {
                MaxDegreeOfParallelism = 20
            }, i =>
            {
                foreach (var url in sharedUrls)
                {
                    scheduler.EnqueueAsync(new[]
                    {
                        new Request(url)
                        {
                            Owner = ownerId
                        }
                    }).GetAwaiter().GetResult();
                }

                scheduler.EnqueueAsync(new[]
                {
                    new Request($"http://www.{i.ToString()}.com", null)
                    {
                        Owner = ownerId
                    }
                }).GetAwaiter().GetResult();
            });

            Parallel.For(0, 1000, new ParallelOptions {
                MaxDegreeOfParallelism = 20
            },
                         i => { scheduler.DequeueAsync().GetAwaiter().GetResult(); });

            Assert.Equal(1002, scheduler.Total);
        }
Пример #12
0
 /// <summary>
 /// Sets up a <c>QueueDistinctBfsScheduler</c> and speed 1, wires the <c>CnblogsEntry</c>
 /// parser to the default storage, and adds news pages 1 through 9 one request at a time.
 /// </summary>
 protected override void Initialize()
 {
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;

     var parser = new DataParser<CnblogsEntry>();
     AddDataFlow(parser)
     .AddDataFlow(GetDefaultStorage());

     // One AddRequests call per page, each with its own properties dictionary.
     for (var page = 1; page <= 9; page++)
     {
         var properties = new Dictionary<string, string> { { "网站", "博客园" } };
         var pageRequest = new Request($"https://news.cnblogs.com/n/page/{page}/", properties);
         AddRequests(pageRequest);
     }
 }
Пример #13
0
 /// <summary>
 /// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 1 and depth 3, wires
 /// <c>DatabaseSpiderDataParser</c> to the default storage, and seeds the first two news pages.
 /// </summary>
 protected override void Initialize()
 {
     NewGuidId();
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;
     Depth     = 3;

     var parser = new DatabaseSpiderDataParser();
     AddDataFlow(parser)
     .AddDataFlow(GetDefaultStorage());

     // Each request gets its own dictionary instance, as in the original call.
     var firstPage = new Request(
         "https://news.cnblogs.com/n/page/1/",
         new Dictionary<string, string> { { "网站", "博客园" } });
     var secondPage = new Request(
         "https://news.cnblogs.com/n/page/2/",
         new Dictionary<string, string> { { "网站", "博客园" } });

     AddRequests(firstPage, secondPage);
 }
Пример #14
0
 /// <summary>
 /// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 1 and depth 3, wires the
 /// <c>CnblogsEntry</c> parser to MySQL storage (insert-and-update), and seeds one chart URL.
 /// </summary>
 protected override void Initialize()
 {
     // NOTE(review): credentials are hard-coded in source here; consider loading the
     // connection string from configuration or a secret store instead.
     const string connectionString =
         "Database='mysql';Data Source=192.168.11.128;password=Yang123456.;User ID=root;Port=3306;";

     NewGuidId();
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;
     Depth     = 3;

     var parser = new DataParser<CnblogsEntry>();
     AddDataFlow(parser)
     .AddDataFlow(new MySqlEntityStorage(StorageType.InsertAndUpdate, connectionString));

     var properties  = new Dictionary<string, string> { { "彩票", "双色球" } };
     var seedRequest = new Request("http://datachart.500.com/ssq/", properties);
     AddRequests(seedRequest);
 }
Пример #15
0
 /// <summary>
 /// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 1, depth 3 and the
 /// HttpClient downloader, wires the <c>BaiduSearchEntry</c> parser to the default storage,
 /// and seeds the first two news pages.
 /// </summary>
 protected override void Initialize()
 {
     NewId();
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;
     Depth     = 3;
     DownloaderOptions.Type = DownloaderType.HttpClient;

     var parser = new DataParser<BaiduSearchEntry>();
     AddDataFlow(parser)
     .AddDataFlow(GetDefaultStorage());

     // Each request gets its own dictionary instance, as in the original call.
     var firstPage = new Request(
         "https://news.cnblogs.com/n/page/1/",
         new Dictionary<string, string> { { "网站", "博客园" } });
     var secondPage = new Request(
         "https://news.cnblogs.com/n/page/2/",
         new Dictionary<string, string> { { "网站", "博客园" } });

     AddRequests(firstPage, secondPage);
 }
Пример #16
0
 /// <summary>
 /// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 1 and depth 3, wires the
 /// <c>CnblogsEntry</c> parser to MySQL storage (insert-and-update), and seeds the first
 /// two news pages.
 /// </summary>
 protected override void Initialize()
 {
     // NOTE(review): credentials are hard-coded in source here; consider loading the
     // connection string from configuration or a secret store instead.
     const string connectionString =
         "Database='mysql';Data Source=zousong.com;password=1qazZAQ!;User ID=root;Port=3306;";

     NewGuidId();
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;
     Depth     = 3;

     var parser = new DataParser<CnblogsEntry>();
     AddDataFlow(parser)
     .AddDataFlow(new MySqlEntityStorage(StorageType.InsertAndUpdate, connectionString));

     // Each request gets its own dictionary instance, as in the original call.
     var firstPage = new Request(
         "https://news.cnblogs.com/n/page/1/",
         new Dictionary<string, string> { { "网站", "博客园" } });
     var secondPage = new Request(
         "https://news.cnblogs.com/n/page/2/",
         new Dictionary<string, string> { { "网站", "博客园" } });

     AddRequests(firstPage, secondPage);
 }
Пример #17
0
        /// <summary>
        /// Enqueues four single-request batches (three of them for the same URL), dequeues once
        /// for the owner, then checks the dequeued URL, the remaining requests for that owner
        /// and the scheduler's <c>Total</c>.
        /// </summary>
        public void EnqueueAndDequeueQueueBfs()
        {
            var ownerId   = Guid.NewGuid().ToString("N");
            var scheduler = new QueueDistinctBfsScheduler();

            // Submission order matters for the first assertion: a, b, a, a — one batch per URL.
            var urlsInOrder = new[]
            {
                "http://www.a.com",
                "http://www.b.com",
                "http://www.a.com",
                "http://www.a.com"
            };

            foreach (var url in urlsInOrder)
            {
                var batch = new[] { new Request(url) { OwnerId = ownerId } };
                scheduler.Enqueue(batch);
            }

            var dequeued = scheduler.Dequeue(ownerId);
            var first    = dequeued[0];

            Assert.Equal("http://www.a.com", first.Url);
            Assert.Single(scheduler.Requests[ownerId]);
            Assert.Equal(2, scheduler.Total);
        }
Пример #18
0
 /// <summary>
 /// Sets up a new id, a <c>QueueDistinctBfsScheduler</c>, speed 1 and depth 3, wires the
 /// <c>CnblogsEntry</c> parser to the default storage, and awaits adding the first two news
 /// pages, both with <c>UseProxy</c> enabled.
 /// </summary>
 /// <returns>A task that completes once the seed requests have been added.</returns>
 protected override async Task Initialize()
 {
     NewGuidId();
     Scheduler = new QueueDistinctBfsScheduler();
     Speed     = 1;
     Depth     = 3;

     var parser = new DataParser<CnblogsEntry>();
     AddDataFlow(parser)
     .AddDataFlow(GetDefaultStorage());

     // Each request gets its own dictionary instance, as in the original call.
     var firstPage = new Request(
         "https://news.cnblogs.com/n/page/1/",
         new Dictionary<string, string> { { "网站", "博客园" } })
     {
         UseProxy = true
     };
     var secondPage = new Request(
         "https://news.cnblogs.com/n/page/2/",
         new Dictionary<string, string> { { "网站", "博客园" } })
     {
         UseProxy = true
     };

     await AddRequests(firstPage, secondPage);
 }