/// <summary>
/// Builds a standalone spider, attaches an HTML parser and console storage,
/// seeds it with the cnblogs.com home page and starts it.
/// </summary>
public static void Run1()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();

    var spiderProvider = spiderBuilder.Build();
    var crawler = spiderProvider.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower. Must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Use the plain downloader: a clean HttpClient with no cookie handling.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    var htmlParser = new DataParser
    {
        SelectableFactory = context => context.GetSelectable(ContentType.Html),
        CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
        QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };

    // The second data flow prints crawl results to the console.
    crawler.AddDataFlow(htmlParser).AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider.
    // NOTE(review): the value returned by RunAsync() is discarded here — presumably a Task;
    // confirm callers do not need to await completion or observe failures.
    crawler.RunAsync();
}
/// <summary>
/// Runs the XPath follow-request query for the pager block against the saved
/// "cnblogs.html" page and checks the number and content of the extracted requests.
/// </summary>
public void XpathFollow()
{
    using (var hostBuilder = GetLocalSpiderHostBuilder())
    {
        var spiderProvider = hostBuilder.Build();
        var scopedServices = spiderProvider.CreateScopeServiceProvider();

        var response = new Response
        {
            Request = new Request("http://cnblogs.com"),
            Content = File.ReadAllBytes("cnblogs.html"),
            CharSet = "UTF-8"
        };
        var context = new DataFlowContext(response, scopedServices);

        // Build the selectable from the response when the context does not already carry one.
        if (context.Selectable == null)
        {
            context.Selectable = context.Response?.ToSelectable();
        }

        var followQuery = DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']");
        var followRequests = followQuery.Invoke(context);

        Assert.Equal(12, followRequests.Count);
        Assert.Contains(followRequests, r => r == "http://cnblogs.com/sitehome/p/2");
    }
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check built for the news list-page URL pattern.
/// </summary>
public ListNewsParser()
{
    const string listPagePattern = "news\\.cnblogs\\.com/n/page";

    Required = DataParserHelper.CheckIfRequiredByRegex(listPagePattern);

    // To also follow pagination links, uncomment the following:
    // FollowRequestQuerier =
    //     BuildFollowRequestQuerier(DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']"));
}
/// <summary>
/// Builds a spider host with local event bus, download center, downloader agent and
/// in-memory statistics, then creates a spider seeded with the cnblogs.com home page and starts it.
/// </summary>
public static void Run1()
{
    var hostBuilder = new SpiderHostBuilder()
        .ConfigureLogging(logging => logging.AddSerilog())
        .ConfigureAppConfiguration(config => config.AddJsonFile("appsettings.json"))
        .ConfigureServices(services =>
        {
            services.AddLocalEventBus();
            services.AddLocalDownloadCenter();
            services.AddDownloaderAgent(agent =>
            {
                agent.UseFileLocker();
                agent.UseDefaultAdslRedialer();
                agent.UseDefaultInternetDetector();
            });
            services.AddStatisticsCenter(statistics => statistics.UseMemory());
        });

    var spiderProvider = hostBuilder.Build();
    var crawler = spiderProvider.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower. Must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;

    var htmlParser = new DataParser
    {
        SelectableFactory = context => context.GetSelectable(ContentType.Html),
        Required = DataParserHelper.CheckIfRequiredByRegex("cnblogs\\.com"),
        GetFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };

    // The second data flow prints crawl results to the console.
    crawler.AddDataFlow(htmlParser).AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider.
    // NOTE(review): the value returned by RunAsync() is discarded here — presumably a Task;
    // confirm callers do not need to await completion or observe failures.
    crawler.RunAsync();
}
/// <summary>
/// Stores the item mapping and, when follow selectors are supplied, configures
/// <c>QueryFollowRequests</c> from them.
/// </summary>
/// <param name="itemFollowSelectors">
/// Semicolon-separated XPath selectors, e.g.
/// <c>//article/h1[@class='title_news']/a[1];//article[@class='list_news']/h4[@class='title_news']/a[1]</c>.
/// When null, empty, or containing only blank entries, <c>QueryFollowRequests</c> is left untouched.
/// </param>
/// <param name="mapping">The mapping stored in <c>_mapping</c>.</param>
public SimpleItemDataParser(string itemFollowSelectors, ItemMapping mapping)
{
    // CanParse = DataParserHelper.CanParseByRegex("vnexpress\\.com");
    _mapping = mapping;

    if (string.IsNullOrWhiteSpace(itemFollowSelectors))
    {
        return;
    }

    // string.Split never returns null, so only emptiness needs checking. Entries are trimmed and
    // blank ones dropped so that input such as "a; ;b" does not forward a whitespace-only XPath.
    var rawSelectors = itemFollowSelectors.Split(';', StringSplitOptions.RemoveEmptyEntries);
    var trimmedSelectors = Array.ConvertAll(rawSelectors, selector => selector.Trim());
    var selectors = Array.FindAll(trimmedSelectors, selector => selector.Length > 0);

    if (selectors.Length > 0)
    {
        QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(selectors);
    }
}
/// <summary>
/// Runs the XPath follow-request query for the pager block against the text of the saved
/// "cnblogs.html" page and checks the number and content of the extracted requests.
/// </summary>
public void XpathFollow()
{
    var scopedServices = SpiderProvider.Value.CreateScopeServiceProvider();

    var response = new Response
    {
        Request = new Request("http://cnblogs.com"),
        RawText = File.ReadAllText("cnblogs.html")
    };
    var context = new DataFlowContext(response, scopedServices);

    var followQuery = DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']");
    var followRequests = followQuery.Invoke(context);

    Assert.Equal(12, followRequests.Count);
    Assert.Contains(followRequests, r => r == "http://cnblogs.com/sitehome/p/2");
}
/// <summary>
/// Sets <c>RequireParse</c> to the regex-based check for nvshens.com URLs of the form
/// <c>/{word}/{digits}/</c>.
/// </summary>
public NvshensFirstPageDetailDataParser()
{
    const string firstDetailPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w+\\/\\d*\\/$";

    RequireParse = DataParserHelper.CanParseByRegex(firstDetailPagePattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>RequireParse</c> to the regex-based check for nvshens.com gallery URLs of the form
/// <c>/gallery/{word}/{digits}.html</c>.
/// </summary>
public NvshensPageTagDataParser()
{
    // The dot before "html" is escaped so it only matches a literal '.',
    // not any character (e.g. ".../12xhtml").
    const string tagPagePattern = "^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/\\w*\\/\\d+\\.html$";

    RequireParse = DataParserHelper.CanParseByRegex(tagPagePattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for URLs containing <c>/m/</c>.
/// </summary>
public V_360KanDetailParser()
{
    const string detailPathPattern = "/m/";

    Required = DataParserHelper.CheckIfRequiredByRegex(detailPathPattern);
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for the nvshens.com <c>/gallery/</c> index URL.
/// </summary>
public NvshensTagIndexDataParser()
{
    const string galleryIndexPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/$";

    Required = DataParserHelper.CheckIfRequiredByRegex(galleryIndexPattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for umei.cc pages of the form
/// <c>/p/gaoqing/cn/{1-2 digits}.htm</c>.
/// </summary>
public YouMeiSpider()
{
    // Only parent (listing) pages are handled here; other pages are left to other parser types.
    // Dots in the host name and before "htm" are escaped so they match a literal '.' only.
    const string listingPagePattern = "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{1,2}\\.htm$";

    Required = DataParserHelper.CheckIfRequiredByRegex(listingPagePattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for nvshens.org URLs of the form
/// <c>/{word}/{digits}/</c>.
/// </summary>
public NvshensFirstPageDetailDataParser()
{
    const string firstDetailPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.org\\/\\w+\\/\\d*\\/$";

    Required = DataParserHelper.CheckIfRequiredByRegex(firstDetailPagePattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for nvshens.org gallery URLs of the form
/// <c>/gallery/{word}/{digits}.html</c>.
/// </summary>
public NvshensPageTagDataParser()
{
    // The dot before "html" is escaped so it only matches a literal '.',
    // not any character (e.g. ".../12xhtml").
    const string tagPagePattern = "^((https|http) ?:\\/\\/)www\\.nvshens\\.org\\/gallery\\/\\w*\\/\\d+\\.html$";

    Required = DataParserHelper.CheckIfRequiredByRegex(tagPagePattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for umei.cc detail pages, with or without
/// a <c>_{n}</c> suffix after the numeric id.
/// </summary>
public YouMeiDetailSpider()
{
    // Only detail pages are handled here; other pages are left to other parser types.
    // Example: http://www.umei.cc/p/gaoqing/cn/188495.htm
    // Dots in the host name and before "htm" are escaped so they match a literal '.' only.
    const string detailPagePattern = "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{3,15}\\.htm$";
    const string detailSubPagePattern = "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{3,50}_\\d{1,3}\\.htm$";

    Required = DataParserHelper.CheckIfRequiredByRegex(detailPagePattern, detailSubPagePattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for music.163.com playlist URLs
/// (<c>/#/playlist?id={digits}</c>).
/// </summary>
public MusicListDataParser()
{
    // Dots in the host name are escaped so they match a literal '.' only,
    // instead of any character (e.g. "musicX163Ycom").
    const string playlistPattern = "^(http|https)?://music\\.163\\.com/#/playlist\\?id=[0-9]*";

    Required = DataParserHelper.CheckIfRequiredByRegex(playlistPattern);
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for cnblogs.com URLs and
/// <c>GetFollowRequests</c> to the XPath query for <c>"."</c>.
/// </summary>
public CnblogsDataParser()
{
    const string hostPattern = "cnblogs\\.com";
    const string followXPath = ".";

    Required = DataParserHelper.CheckIfRequiredByRegex(hostPattern);
    GetFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(followXPath);
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for nvshens.com URLs of the form
/// <c>/gallery/{word}/</c>.
/// </summary>
public NvshensFirstPageTagDataParser()
{
    // Earlier, broader rule kept for reference:
    // CanParse = RegexCanParse("^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(((\\w)*\\/$)|(\\w*\\/\\d.html$))");
    const string firstTagPagePattern = "^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(\\w)*\\/$";

    Required = DataParserHelper.CheckIfRequiredByRegex(firstTagPagePattern);
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for cnblogs.com URLs and builds
/// <c>FollowRequestQuerier</c> from the XPath query for <c>"."</c>.
/// </summary>
public CnblogsDataParser()
{
    const string hostPattern = "cnblogs\\.com";

    Required = DataParserHelper.CheckIfRequiredByRegex(hostPattern);

    var followByXPath = DataParserHelper.QueryFollowRequestsByXPath(".");
    FollowRequestQuerier = BuildFollowRequestQuerier(followByXPath);
}
/// <summary>
/// Stores <paramref name="start"/>, sets <c>Required</c> to the regex-based check for URLs
/// containing <c>/list</c>, and builds <c>FollowRequestQuerier</c> from the XPath query for
/// the last anchor inside the <c>ew-page</c> div.
/// </summary>
/// <param name="start">The value stored in <c>_start</c>.</param>
public V_360KanParser(SpiderStart start)
{
    _start = start;

    const string listPathPattern = "/list";
    Required = DataParserHelper.CheckIfRequiredByRegex(listPathPattern);

    var lastPagerLinkQuery = DataParserHelper.QueryFollowRequestsByXPath("//div[@class='ew-page']/a[last()]");
    FollowRequestQuerier = BuildFollowRequestQuerier(lastPagerLinkQuery);
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for news article URLs
/// (<c>news.cnblogs.com/n/{digits}</c>).
/// </summary>
public NewsParser()
{
    const string articlePattern = "news\\.cnblogs\\.com/n/\\d+";

    Required = DataParserHelper.CheckIfRequiredByRegex(articlePattern);
}
/// <summary>
/// Sets <c>QueryFollowRequests</c> to the XPath query for <c>li</c> elements with class <c>next</c>.
/// </summary>
public QuotesDataParser()
{
    // Disabled URL filter kept for reference:
    // CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com");
    const string nextLinkXPath = "//li[@class='next']";

    QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(nextLinkXPath);
}
/// <summary>
/// Sets <c>Required</c> to the regex-based check for nvshens.com URLs of the form
/// <c>/{char}/{digits}/{digits}.html</c>.
/// </summary>
public NvshensPageDetailDataParser()
{
    // The dot before "html" is escaped so it only matches a literal '.',
    // not any character (e.g. ".../2xhtml").
    const string detailPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w\\/\\d*\\/\\d+\\.html$";

    Required = DataParserHelper.CheckIfRequiredByRegex(detailPagePattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>CanParse</c> to the regex-based check for cnblogs.com URLs and
/// <c>QueryFollowRequests</c> to the XPath query for the pager anchor whose text contains "Next".
/// </summary>
public CnblogsDataParser()
{
    const string hostPattern = "cnblogs\\.com";
    const string nextPageXPath = "//div[@class='pager']/a[contains(text(),'Next')]";

    CanParse = DataParserHelper.CanParseByRegex(hostPattern);
    QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(nextPageXPath);
}
/// <summary>
/// Sets <c>CanParse</c> to the regex-based check for cnblogs.com URLs and
/// <c>QueryFollowRequests</c> to the XPath query for <c>"."</c>.
/// </summary>
public CnblogsDataParser()
{
    const string hostPattern = "cnblogs\\.com";
    const string followXPath = ".";

    CanParse = DataParserHelper.CanParseByRegex(hostPattern);
    QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(followXPath);
}
/// <summary>
/// Sets <c>CanParse</c> to the regex-based check for the nvshens.com <c>/gallery/</c> index URL.
/// </summary>
public NvshensTagIndexDataParser()
{
    const string galleryIndexPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/$";

    CanParse = DataParserHelper.CanParseByRegex(galleryIndexPattern);

    // Disabled follow rule kept for reference:
    // Follow = XpathFollow(".");
}