/// <summary>
/// Sample entry point: builds a standalone spider from a <c>SpiderBuilder</c>,
/// attaches a regex/XPath driven <c>DataParser</c> plus a <c>ConsoleStorage</c>,
/// seeds it with the cnblogs home page and starts it.
/// </summary>
/// <remarks>
/// NOTE(review): the value returned by <c>RunAsync()</c> is discarded, so this method
/// presumably returns before the crawl has finished - confirm that callers keep the
/// process alive and do not rely on completion or on exceptions surfacing here.
/// </remarks>
public static void Run1()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();

    var services = spiderBuilder.Build();
    var crawler = services.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: how many requests are downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower. Must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Plain downloader: a clean HttpClient that does not deal with cookies.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    var pageParser = new DataParser
    {
        SelectableFactory = ctx => ctx.GetSelectable(ContentType.Html),
        CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
        QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };

    // ConsoleStorage prints the crawl results to the console.
    crawler.AddDataFlow(pageParser).AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider.
    crawler.RunAsync();
}
/// <summary>
/// Sample entry point: configures a <c>SpiderHostBuilder</c> (Serilog logging,
/// <c>appsettings.json</c>, local message queue, local downloader agent, local download
/// center and in-memory statistics), creates a <c>Spider</c>, attaches a regex/XPath
/// driven <c>DataParser</c> plus a <c>ConsoleStorage</c>, seeds it with the cnblogs home
/// page and starts it.
/// </summary>
/// <remarks>
/// NOTE(review): the value returned by <c>RunAsync()</c> is discarded, so this method
/// presumably returns before the crawl has finished - confirm that callers keep the
/// process alive and do not rely on completion or on exceptions surfacing here.
/// </remarks>
public static void Run1()
{
    var hostBuilder = new SpiderHostBuilder()
        .ConfigureLogging(logging => logging.AddSerilog())
        .ConfigureAppConfiguration(config => config.AddJsonFile("appsettings.json"))
        .ConfigureServices(services =>
        {
            services.AddLocalMessageQueue();
            services.AddLocalDownloaderAgent(agent =>
            {
                agent.UseFileLocker();
                agent.UseDefaultAdslRedialer();
                agent.UseDefaultInternetDetector();
            });
            services.AddLocalDownloadCenter();
            services.AddSpiderStatisticsCenter(statistics => statistics.UseMemory());
        });

    var host = hostBuilder.Build();
    var crawler = host.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: how many requests are downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower. Must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Plain downloader: a clean HttpClient that does not deal with cookies.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    var pageParser = new DataParser
    {
        SelectableFactory = ctx => ctx.GetSelectable(ContentType.Html),
        CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
        QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };

    // ConsoleStorage prints the crawl results to the console.
    crawler.AddDataFlow(pageParser).AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider.
    crawler.RunAsync();
}
/// <summary>
/// Initializes the parser with a regex-based <c>CanParse</c> delegate and an
/// XPath-based <c>QueryFollowRequests</c> delegate, both produced by
/// <c>DataParserHelper</c>.
/// </summary>
public CnblogsDataParser()
{
    // Regex handed to CanParseByRegex; presumably matched against the request URL - confirm.
    const string urlPattern = "cnblogs\\.com";
    // XPath handed to QueryFollowRequestsByXPath ("." is the current node).
    const string followXPath = ".";

    CanParse = DataParserHelper.CanParseByRegex(urlPattern);
    QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(followXPath);
}
/// <summary>
/// Initializes the parser with a <c>RequireParse</c> delegate built from a regex that
/// matches exactly the <c>www.nvshens.com/gallery/</c> address.
/// </summary>
public NvshensTagIndexDataParser()
{
    // Anchored pattern: "http"/"https" (optional as written), "://", the host, then "/gallery/".
    const string galleryIndexPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/$";

    RequireParse = DataParserHelper.CanParseByRegex(galleryIndexPattern);

    // No follow-request query is configured; the line below is left disabled.
    //Follow = XpathFollow(".");
}
/// <summary>
/// Initializes the parser with a <c>RequireParse</c> delegate built from a regex that
/// matches addresses of the form <c>www.nvshens.com/&lt;char&gt;/&lt;digits&gt;/&lt;digits&gt;.html</c>.
/// </summary>
public NvshensPageDetailDataParser()
{
    // The dot before "html" is escaped so that it only matches a literal '.';
    // unescaped it would match any character (e.g. ".../g/1/2xhtml").
    RequireParse = DataParserHelper.CanParseByRegex("^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w\\/\\d*\\/\\d+\\.html$");

    // No follow-request query is configured; the line below is left disabled.
    //Follow = XpathFollow(".");
}
/// <summary>
/// Initializes the parser with a <c>RequireParse</c> delegate built from a regex that
/// matches addresses of the form <c>www.nvshens.com/gallery/&lt;word chars&gt;/</c>.
/// </summary>
public NvshensFirstPageTagDataParser()
{
    // Disabled alternative that would also accept ".../gallery/<word chars>/<digit>.html":
    //CanParse = RegexCanParse("^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(((\\w)*\\/$)|(\\w*\\/\\d.html$))");

    // NOTE(review): " ?" after the scheme group makes a literal space optional (the scheme
    // itself is required here) - confirm whether "(https|http)?" was intended instead.
    const string tagFirstPagePattern = "^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(\\w)*\\/$";

    RequireParse = DataParserHelper.CanParseByRegex(tagFirstPagePattern);
}
/// <summary>
/// Initializes the parser with a <c>CanParse</c> delegate built from a regex that
/// matches addresses of the form <c>www.nvshens.com/&lt;word chars&gt;/&lt;digits&gt;/</c>.
/// </summary>
public NvshensFirstPageDetailDataParser()
{
    // Anchored pattern: optional "http"/"https", "://", the host, one or more word
    // characters, "/", zero or more digits, and a trailing "/".
    const string detailFirstPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w+\\/\\d*\\/$";

    CanParse = DataParserHelper.CanParseByRegex(detailFirstPagePattern);

    // No follow-request query is configured; the line below is left disabled.
    //Follow = XpathFollow(".");
}
/// <summary>
/// Initializes the parser with a <c>CanParse</c> delegate built from a regex that
/// matches addresses of the form
/// <c>www.nvshens.com/gallery/&lt;word chars&gt;/&lt;digits&gt;.html</c>.
/// </summary>
public NvshensPageTagDataParser()
{
    // The dot before "html" is escaped so that it only matches a literal '.';
    // unescaped it would match any character (e.g. ".../gallery/tag/2xhtml").
    // NOTE(review): " ?" after the scheme group makes a literal space optional (the scheme
    // itself is required here) - confirm whether "(https|http)?" was intended instead.
    CanParse = DataParserHelper.CanParseByRegex("^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/\\w*\\/\\d+\\.html$");

    // No follow-request query is configured; the line below is left disabled.
    //Follow = XpathFollow(".");
}
/// <summary>
/// Initializes the parser with a regex-based <c>CanParse</c> delegate and an
/// XPath-based <c>QueryFollowRequests</c> delegate, both produced by
/// <c>DataParserHelper</c>.
/// </summary>
public CnblogsDataParser()
{
    // Regex handed to CanParseByRegex; presumably matched against the request URL - confirm.
    const string urlPattern = "cnblogs\\.com";
    // XPath selecting anchors whose text contains "Next" directly under <div class="pager">.
    const string nextPageXPath = "//div[@class='pager']/a[contains(text(),'Next')]";

    CanParse = DataParserHelper.CanParseByRegex(urlPattern);
    QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(nextPageXPath);
}