/// <summary>
/// Configures a <see cref="SpiderBuilder"/> for the Vnexpress "Kinh Doanh" listing,
/// registers the project definition, and starts a <see cref="VnexpressSpider"/>.
/// </summary>
/// <returns>The value returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    // Previously: var spider = Create<VnexpressSpider>();
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();

    // The project definition is registered below as a singleton instance.
    var definition = new ProjectDefinition
    {
        ProjectName = "Vnexpress Spider",
        Site = "Vnexpress/Kinh Doanh",
        ItemUrlsSelector = "",
        Urls = "https://vnexpress.net/kinh-doanh",
        FileFormat = "*.html",
        FileStorage = @"P:\Neil.Test\Spider Storage\Vnexpress",
        PageLimit = 4,
    };
    spiderBuilder.Services.AddSingleton<ProjectDefinition>(definition);
    spiderBuilder.AddSpider<VnexpressSpider>();

    // Disabled alternatives for wiring the message queue by hand:
    // builder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>((s)=> null);
    // builder.Services.AddSingleton<IDynamicMessageQueue,InMemoryMessageQueue>();
    spiderBuilder.UseDynamicMessageQueue();

    var spiderFactory = spiderBuilder.Build();
    var vnexpressSpider = spiderFactory.Create<VnexpressSpider>();
    return vnexpressSpider.RunAsync();
}
// Runs a spider with the Empty downloader and RetryWhenResultIsEmpty disabled, then
// asserts that the statistics store reports exactly one successful request and no failures.
public void DoNotRetryWhenResultIsEmpty()
{
    // Arrange: build the host without loading command-line configuration.
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration(null, null, false);
    spiderBuilder.UseStandalone();
    var services = spiderBuilder.Build();

    var target = services.Create<Spider>();
    target.NewGuidId();
    target.Name = "RetryWhenResultIsEmpty";
    target.EmptySleepTime = 15;
    target.RetryDownloadTimes = 5;
    target.RetryWhenResultIsEmpty = false;
    target.DownloaderSettings.Type = DownloaderType.Empty;
    target.Scheduler = new QueueDistinctBfsScheduler();
    target.AddRequests("http://www.DoNotRetryWhenResultIsEmpty.com");

    // Act: block until the run finishes (the test method itself is synchronous).
    target.RunAsync().Wait();

    // Assert: read back both the per-spider and the per-downloader statistics.
    var store = services.GetRequiredService<IStatisticsStore>();
    var spiderStats = store.GetSpiderStatisticsAsync(target.Id).Result;
    var downloadStatsPage = store.GetDownloadStatisticsListAsync(1, 10).Result;
    var downloadStats = downloadStatsPage[0];

    Assert.Equal(1, spiderStats.Total);
    Assert.Equal(0, spiderStats.Failed);
    Assert.Equal(1, spiderStats.Success);

    Assert.Equal(0, downloadStats.Failed);
    Assert.Equal(1, downloadStats.Success);
}
/// <summary>
/// Starts the image downloader, then configures and launches a <see cref="Spider"/>
/// that crawls the nvshens gallery pages through four Nvshens data parsers.
/// </summary>
/// <remarks>
/// NOTE(review): the value returned by <c>RunAsync</c> is discarded, so this method does
/// not wait for the crawl to complete — confirm callers keep the process alive.
/// </remarks>
public static void Run()
{
    ImageDownloader.GetInstance().Start();

    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();
    var services = spiderBuilder.Build();

    var crawler = services.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "宅男女神图片采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower; must not be 0.
    crawler.Speed = 2;
    // Crawl depth.
    crawler.Depth = 5;
    // Use the plain downloader: not tied to cookies, a clean HttpClient.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    // Disabled: crawler.AddDataFlow(new NvshensTagIndexDataParser());
    crawler.AddDataFlow(new NvshensFirstPageTagDataParser());
    crawler.AddDataFlow(new NvshensPageTagDataParser());
    crawler.AddDataFlow(new NvshensFirstPageDetailDataParser());
    crawler.AddDataFlow(new NvshensPageDetailDataParser());

    // Start URL (disabled alternative: "https://www.nvshens.com/gallery/").
    crawler.AddRequests("https://www.nvshens.com/gallery/luoli/");

    // Launch.
    crawler.RunAsync();
}
/// <summary>
/// Configures and launches a <see cref="Spider"/> that starts at the cnblogs home page,
/// using an inline <see cref="DataParser"/> followed by a <see cref="ConsoleStorage"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the value returned by <c>RunAsync</c> is discarded, so this method does
/// not wait for the crawl to complete — confirm callers keep the process alive.
/// </remarks>
public static void Run1()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();
    var services = spiderBuilder.Build();

    var crawler = services.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower; must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Use the plain downloader: not tied to cookies, a clean HttpClient.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    // Parser: HTML selectable, accepts URLs matching the cnblogs regex, follows links found via XPath ".".
    var pageParser = new DataParser
    {
        SelectableFactory = ctx => ctx.GetSelectable(ContentType.Html),
        CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
        QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };

    // Print crawl results to the console.
    crawler.AddDataFlow(pageParser)
        .AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Launch.
    crawler.RunAsync();
}
/// <summary>
/// Runs the spider selected by the supplied arguments.
/// </summary>
/// <remarks>
/// The configuration built from <paramref name="args"/> is read for the keys
/// <c>spider</c> (required), <c>name</c>, <c>id</c> (a new GUID in "N" format when absent),
/// <c>config</c>, <c>args</c> (split on single spaces) and <c>distribute</c>
/// (standalone mode is used unless the value is exactly <c>"true"</c>).
/// The spider type is matched against <c>spider</c> by type name using an ordinal,
/// case-insensitive comparison so the lookup does not depend on the current culture.
/// </remarks>
/// <param name="args">Run arguments.</param>
/// <exception cref="SpiderException">
/// The <c>spider</c> configuration value is missing, empty or whitespace.
/// </exception>
public static void Run(params string[] args)
{
    Framework.SetEncoding();

    var configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
    var configuration = configurationBuilder.Build();

    string spider = configuration["spider"];
    if (string.IsNullOrWhiteSpace(spider))
    {
        throw new SpiderException("未指定需要执行的爬虫");
    }

    var name = configuration["name"];
    var id = configuration["id"] ?? Guid.NewGuid().ToString("N");
    var config = configuration["config"];
    var arguments = configuration["args"]?.Split(' ');
    var distribute = configuration["distribute"] == "true";

    PrintEnvironment(args);

    var spiderTypes = DetectSpiders();
    if (spiderTypes == null || spiderTypes.Count == 0)
    {
        return;
    }

    // Ordinal, case-insensitive match: ToLower() on both sides is culture-sensitive
    // (e.g. 'I' lowers to a dotless 'ı' under tr-TR) and allocated two strings per candidate.
    var spiderType = spiderTypes.FirstOrDefault(
        x => string.Equals(x.Name, spider, StringComparison.OrdinalIgnoreCase));
    if (spiderType == null)
    {
        ConsoleHelper.WriteLine($"未找到爬虫: {spider}", 0, ConsoleColor.DarkYellow);
        return;
    }

    var builder = new SpiderBuilder();
    builder.AddSerilog();
    builder.ConfigureAppConfiguration(config);
    if (!distribute)
    {
        builder.UseStandalone();
    }

    builder.AddSpider(spiderType);
    var provider = builder.Build();

    var instance = provider.Create(spiderType);
    if (instance == null)
    {
        ConsoleHelper.WriteLine("创建爬虫对象失败", 0, ConsoleColor.DarkYellow);
        return;
    }

    instance.Name = name;
    instance.Id = id;
    // NOTE(review): the value returned by RunAsync is discarded, as in the original;
    // this method does not wait for the spider to finish.
    instance.RunAsync(arguments);
}
/// <summary>
/// Builds the spider factory shared by derived tests and logs the value of the
/// <c>Development</c> configuration key.
/// </summary>
protected TestBase()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration(null, null, false);
    spiderBuilder.UseStandalone();

    SpiderFactory = spiderBuilder.Build();

    // Resolve the logger first, then the configuration, matching the original resolution order.
    var logger = SpiderFactory.GetRequiredService<ILogger<TestBase>>();
    var configuration = SpiderFactory.GetRequiredService<IConfiguration>();
    logger.LogInformation($"Development {configuration["Development"]}");
}
/// <summary>
/// Configures a <see cref="SpiderBuilder"/>, registers a <see cref="VnexpressSpider"/>
/// together with an <c>IDynamicMessageQueue</c> singleton factory, and starts the spider.
/// </summary>
/// <returns>The value returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    // Previously: var spider = Create<VnexpressSpider>();
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<VnexpressSpider>();

    // NOTE(review): the factory registered for IDynamicMessageQueue always returns null —
    // confirm this is intentional.
    spiderBuilder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>(
        serviceProvider => null);
    // Disabled alternative:
    // builder.Services.AddSingleton<IDynamicMessageQueue>(null as IDynamicMessageQueue);

    var spiderFactory = spiderBuilder.Build();
    var vnexpressSpider = spiderFactory.Create<VnexpressSpider>();
    return vnexpressSpider.RunAsync();
}
/// <summary>
/// Configures a <see cref="SpiderBuilder"/> with a fake proxy validator, a proxy supply URL
/// and a <see cref="ProjectDefinition"/> for the Vnexpress "Kinh Doanh" section, then starts
/// an <see cref="HttpClientSpider"/>.
/// </summary>
/// <returns>The value returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    // Previously: var spider = Create<VnexpressSpider>();
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();

    spiderBuilder.Services.AddSingleton<IProxyValidator, FakeProxyValidator>();
    // Disabled alternative:
    // builder.Services.AddSingleton<IProxyValidator, DefaultProxyValidator>();

    // The proxy supply URL is passed in as if it were a command-line argument.
    var commandLineArgs = new string[] { "/ProxySupplyUrl=http://localhost:52445/api/proxies" };
    spiderBuilder.ConfigureAppConfiguration(null, args: commandLineArgs, true);
    spiderBuilder.UseStandalone();

    // Field-level selectors applied inside each matched item.
    var fieldMappings = new FieldMapping[]
    {
        new FieldMapping
        {
            Field = "Title",
            CssSelector = "//h1[@class='title_news_detail mb10']"
        },
        new FieldMapping
        {
            Field = "Description",
            CssSelector = "//p[@class='description']"
        },
    };

    var definition = new ProjectDefinition
    {
        ProjectName = "Vnexpress Spider",
        Site = "Vnexpress/Kinh Doanh",
        ItemUrlsSelector = "//article/h1[@class='title_news']/a[1];//article[@class='list_news']/h4[@class='title_news']/a[1]",
        Urls = "https://vnexpress.net/kinh-doanh",
        FileStorage = @"P:\Neil.Test\Spider Storage\Vnexpress",
        FileFormat = "*.json",
        PageLimit = 4,
        Deepth = 2,
        NextPageSelector = "//p[@id='pagination']/a[@class='next']",
        NumberOfConcurrentRequests = 5,
        Mapping = new ItemMapping
        {
            ItemCssSelector = "//section[@id='left_calculator']",
            Mapping = fieldMappings
        }
    };
    spiderBuilder.Services.AddSingleton<ProjectDefinition>(definition);
    spiderBuilder.AddSpider<HttpClientSpider>();

    // Disabled alternatives for wiring the message queue by hand:
    // builder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>((s)=> null);
    // builder.Services.AddSingleton<IDynamicMessageQueue,InMemoryMessageQueue>();
    spiderBuilder.UseDynamicMessageQueue();

    var spiderFactory = spiderBuilder.Build();
    var httpSpider = spiderFactory.Create<HttpClientSpider>();
    return httpSpider.RunAsync();
}
/// <summary>
/// Configures and launches a <see cref="Spider"/> that starts at the cnblogs home page,
/// parsing with <see cref="CnblogsDataParser"/> and writing results to a
/// <see cref="ConsoleStorage"/>.
/// </summary>
/// <returns>The value returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();
    var services = spiderBuilder.Build();

    var crawler = services.Create<Spider>();

    // Task identifier.
    crawler.NewGuidId();
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower; must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Use the plain downloader: not tied to cookies, a clean HttpClient.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    crawler.AddDataFlow(new CnblogsDataParser())
        .AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Launch.
    return crawler.RunAsync();
}
/// <summary>
/// Builds a host without loading command-line configuration, creates a
/// <see cref="CnblogsSpider"/> and awaits its <c>RunAsync</c> call.
/// </summary>
/// <remarks>
/// NOTE(review): this method is <c>async void</c>, so callers cannot await it or observe
/// its exceptions; returning <c>Task</c> would be preferable if the signature may change.
/// </remarks>
public static async void Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration(loadCommandLine: false);
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<CnblogsSpider>();
    var services = spiderBuilder.Build();

    var cnblogsSpider = services.Create<CnblogsSpider>();

    // Disabled manual configuration, kept for reference:
    // spider.Scheduler = new QueueBfsScheduler();
    // spider.NewGuidId(); // task identifier
    // spider.Name = "博客园全站采集"; // task name
    // spider.Speed = 1; // crawl speed: requests downloaded per second; must not be 0
    // spider.Depth = 3; // crawl depth
    // spider.DownloaderSettings.Type = DownloaderType.HttpClient; // plain downloader, clean HttpClient
    // spider.AddDataFlow(new CnblogsDataParser()).AddDataFlow(new ConsoleStorage());
    // spider.AddRequests("http://www.cnblogs.com/"); // start URL

    // Launch.
    await cnblogsSpider.RunAsync();
}