/// <summary>
/// Sample entry point: builds a spider host, creates a <c>Spider</c>, configures it
/// (id, name, speed, depth, data flows, start URL) and starts it.
/// </summary>
/// <remarks>
/// The result of <c>RunAsync()</c> is neither awaited nor otherwise observed here,
/// so this method returns right after the call is issued.
/// </remarks>
public static void Run1()
{
    // Host wiring: logging, JSON configuration file and the services registered for the crawl.
    var hostBuilder = new SpiderHostBuilder()
        .ConfigureLogging(logging => logging.AddSerilog())
        .ConfigureAppConfiguration(configuration => configuration.AddJsonFile("appsettings.json"))
        .ConfigureServices(serviceCollection =>
        {
            serviceCollection.AddLocalEventBus();
            serviceCollection.AddLocalDownloadCenter();
            serviceCollection.AddDownloaderAgent(agent =>
            {
                agent.UseFileLocker();
                agent.UseDefaultAdslRedialer();
                agent.UseDefaultInternetDetector();
            });
            serviceCollection.AddStatisticsCenter(statistics => statistics.UseMemory());
        });

    var spiderFactory = hostBuilder.Build();
    var crawler = spiderFactory.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger means faster;
    // below 1, smaller means slower. Must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;

    var pageParser = new DataParser
    {
        SelectableFactory = context => context.GetSelectable(ContentType.Html),
        Required = DataParserHelper.CheckIfRequiredByRegex("cnblogs\\.com"),
        GetFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };

    // The second data flow prints the crawl results to the console.
    crawler.AddDataFlow(pageParser).AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider.
    crawler.RunAsync();
}
/// <summary>
/// Sets <c>Required</c> from a regex check on the news list pages
/// (<c>news.cnblogs.com/n/page</c>).
/// </summary>
public ListNewsParser()
{
    const string newsListPattern = "news\\.cnblogs\\.com/n/page";
    Required = DataParserHelper.CheckIfRequiredByRegex(newsListPattern);

    // To follow pagination links as well, uncomment the assignment below:
    // FollowRequestQuerier =
    //     BuildFollowRequestQuerier(DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']"));
}
/// <summary>
/// Sets <c>Required</c> from a regex check on the gallery index URL
/// (<c>www.nvshens.com/gallery/</c>, anchored at both ends).
/// </summary>
public NvshensTagIndexDataParser()
{
    const string galleryIndexPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/$";
    Required = DataParserHelper.CheckIfRequiredByRegex(galleryIndexPattern);

    // Left disabled:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> from a regex check on paged detail URLs of the form
/// <c>www.nvshens.com/{one word char}/{digits}/{digits}.html</c>.
/// </summary>
public NvshensPageDetailDataParser()
{
    // The dot before "html" is escaped so that it only matches a literal '.';
    // unescaped, it would accept any character in that position (e.g. "12xhtml").
    const string pagedDetailPattern =
        "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w\\/\\d*\\/\\d+\\.html$";
    Required = DataParserHelper.CheckIfRequiredByRegex(pagedDetailPattern);

    // Left disabled:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> from a regex check on first-page tag URLs of the form
/// <c>www.nvshens.com/gallery/{word chars}/</c>.
/// </summary>
public NvshensFirstPageTagDataParser()
{
    // Earlier, broader variant kept for reference:
    // CanParse = RegexCanParse("^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(((\\w)*\\/$)|(\\w*\\/\\d.html$))");

    // NOTE(review): " ?" after the scheme group makes a single space optional before ':';
    // this looks unintended but is preserved as-is - confirm before changing.
    const string firstTagPagePattern = "^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(\\w)*\\/$";
    Required = DataParserHelper.CheckIfRequiredByRegex(firstTagPagePattern);
}
/// <summary>
/// Sets <c>Required</c> from a regex check on news detail URLs
/// (<c>news.cnblogs.com/n/</c> followed by one or more digits).
/// </summary>
public NewsParser()
{
    const string newsDetailPattern = "news\\.cnblogs\\.com/n/\\d+";
    Required = DataParserHelper.CheckIfRequiredByRegex(newsDetailPattern);
}
/// <summary>
/// Sets <c>Required</c> from a regex check on <c>cnblogs.com</c> and builds the
/// follow-request querier from the XPath <c>"."</c>.
/// </summary>
public CnblogsDataParser()
{
    const string sitePattern = "cnblogs\\.com";
    Required = DataParserHelper.CheckIfRequiredByRegex(sitePattern);

    // "." selects the current node, i.e. the whole selectable handed to the querier.
    var wholePageQuery = DataParserHelper.QueryFollowRequestsByXPath(".");
    FollowRequestQuerier = BuildFollowRequestQuerier(wholePageQuery);
}
/// <summary>
/// Sets <c>Required</c> from a regex check on <c>cnblogs.com</c> and assigns
/// <c>GetFollowRequests</c> from the XPath <c>"."</c>.
/// </summary>
public CnblogsDataParser()
{
    const string sitePattern = "cnblogs\\.com";
    const string currentNodeXPath = ".";

    Required = DataParserHelper.CheckIfRequiredByRegex(sitePattern);
    GetFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(currentNodeXPath);
}
/// <summary>
/// Sets <c>Required</c> from a regex check on paged tag URLs of the form
/// <c>www.nvshens.org/gallery/{word chars}/{digits}.html</c>.
/// </summary>
public NvshensPageTagDataParser()
{
    // The dot before "html" is escaped so that it only matches a literal '.';
    // unescaped, it would accept any character in that position.
    // NOTE(review): " ?" after the scheme group makes a single space optional before ':';
    // this looks unintended but is left unchanged - confirm before touching it.
    const string pagedTagPattern =
        "^((https|http) ?:\\/\\/)www\\.nvshens\\.org\\/gallery\\/\\w*\\/\\d+\\.html$";
    Required = DataParserHelper.CheckIfRequiredByRegex(pagedTagPattern);

    // Left disabled:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> from a regex check on first-page detail URLs of the form
/// <c>www.nvshens.org/{word chars}/{optional digits}/</c>.
/// </summary>
public NvshensFirstPageDetailDataParser()
{
    const string firstDetailPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.org\\/\\w+\\/\\d*\\/$";
    Required = DataParserHelper.CheckIfRequiredByRegex(firstDetailPagePattern);

    // Left disabled:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> from a regex check on detail-page URLs under
/// <c>www.umei.cc/p/gaoqing/cn/</c>: either <c>{3-15 digits}.htm</c> or
/// <c>{3-50 digits}_{1-3 digits}.htm</c>.
/// </summary>
public YouMeiDetailSpider()
{
    // Only detail pages are handled here; other pages are left to other parsers.
    // Example: http://www.umei.cc/p/gaoqing/cn/188495.htm
    //
    // The dots in the host name and before "htm" are escaped so that they only match a
    // literal '.'; unescaped, each one would accept any character (e.g. "wwwXumei.cc").
    const string detailPagePattern =
        "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{3,15}\\.htm$";
    const string detailSubPagePattern =
        "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{3,50}_\\d{1,3}\\.htm$";

    Required = DataParserHelper.CheckIfRequiredByRegex(detailPagePattern, detailSubPagePattern);

    // Left disabled:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Sets <c>Required</c> from a regex check on playlist URLs of the form
/// <c>music.163.com/#/playlist?id={digits}</c>.
/// </summary>
public MusicListDataParser()
{
    // The dots in the host name are escaped so that they only match a literal '.';
    // unescaped, each one would accept any character (e.g. "musicX163Ycom").
    const string playlistPattern = "^(http|https)?://music\\.163\\.com/#/playlist\\?id=[0-9]*";
    Required = DataParserHelper.CheckIfRequiredByRegex(playlistPattern);
}
/// <summary>
/// Sets <c>Required</c> from a regex check on URLs of the form
/// <c>www.umei.cc/p/gaoqing/cn/{1-2 digits}.htm</c>.
/// </summary>
public YouMeiSpider()
{
    // Only parent pages are handled here; other pages are left to other parsers.
    //
    // The dots in the host name and before "htm" are escaped so that they only match a
    // literal '.'; unescaped, each one would accept any character.
    const string parentPagePattern =
        "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{1,2}\\.htm$";
    Required = DataParserHelper.CheckIfRequiredByRegex(parentPagePattern);

    // Left disabled:
    // Follow = XpathFollow(".");
}
/// <summary>
/// Stores <paramref name="start"/>, sets <c>Required</c> from a regex check on
/// <c>/list</c>, and builds the follow-request querier from the last anchor inside
/// the <c>ew-page</c> div.
/// </summary>
/// <param name="start">Value kept in the <c>_start</c> field; stored as given, without validation.</param>
public V_360KanParser(SpiderStart start)
{
    const string listPagePattern = "/list";
    const string lastPagerLinkXPath = "//div[@class='ew-page']/a[last()]";

    _start = start;
    Required = DataParserHelper.CheckIfRequiredByRegex(listPagePattern);

    var pagerQuery = DataParserHelper.QueryFollowRequestsByXPath(lastPagerLinkXPath);
    FollowRequestQuerier = BuildFollowRequestQuerier(pagerQuery);
}
/// <summary>
/// Sets <c>Required</c> from a regex check on the <c>/m/</c> path fragment.
/// </summary>
public V_360KanDetailParser()
{
    const string detailPagePattern = "/m/";
    Required = DataParserHelper.CheckIfRequiredByRegex(detailPagePattern);
}