Beispiel #1
0
        /// <summary>
        /// Builds a spider host (Serilog logging, <c>appsettings.json</c> configuration and the
        /// local event bus / download center / downloader agent / in-memory statistics services),
        /// creates a <c>Spider</c> from it, configures the crawl and starts it.
        /// </summary>
        public static void Run1()
        {
            var hostBuilder = new SpiderHostBuilder()
                .ConfigureLogging(logging => logging.AddSerilog())
                .ConfigureAppConfiguration(configuration => configuration.AddJsonFile("appsettings.json"))
                .ConfigureServices(services =>
                {
                    services.AddLocalEventBus();
                    services.AddLocalDownloadCenter();
                    services.AddDownloaderAgent(agent =>
                    {
                        agent.UseFileLocker();
                        agent.UseDefaultAdslRedialer();
                        agent.UseDefaultInternetDetector();
                    });
                    services.AddStatisticsCenter(statistics => statistics.UseMemory());
                });

            var serviceProvider = hostBuilder.Build();
            var crawler = serviceProvider.Create<Spider>();

            // Task identifier.
            crawler.Id = Guid.NewGuid().ToString("N");
            // Task name.
            crawler.Name = "博客园全站采集";
            // Crawl speed: how many requests are downloaded per second. Above 1, larger is faster;
            // below 1, smaller is slower; must not be 0.
            crawler.Speed = 1;
            // Crawl depth.
            crawler.Depth = 3;

            var pageParser = new DataParser
            {
                SelectableFactory = ctx => ctx.GetSelectable(ContentType.Html),
                Required = DataParserHelper.CheckIfRequiredByRegex("cnblogs\\.com"),
                GetFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
            };

            // The second data flow prints the crawl results to the console.
            crawler.AddDataFlow(pageParser).AddDataFlow(new ConsoleStorage());

            // Seed URL.
            crawler.AddRequests("http://www.cnblogs.com/");

            // Start the crawl.
            // NOTE(review): the result of RunAsync() is not awaited or observed here — presumably
            // the caller keeps the process alive until the crawl finishes; confirm.
            crawler.RunAsync();
        }
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for the
 /// news list page pattern <c>news\.cnblogs\.com/n/page</c>.
 /// </summary>
 public ListNewsParser()
 {
     const string listPagePattern = "news\\.cnblogs\\.com/n/page";
     Required = DataParserHelper.CheckIfRequiredByRegex(listPagePattern);

     // To also follow pagination links, uncomment the assignment below.
     //FollowRequestQuerier =
     //	BuildFollowRequestQuerier(DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']"));
 }
Beispiel #3
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check whose pattern is
 /// anchored at both ends on the <c>www.nvshens.com/gallery/</c> index URL.
 /// </summary>
 public NvshensTagIndexDataParser()
 {
     const string galleryIndexPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/$";
     Required = DataParserHelper.CheckIfRequiredByRegex(galleryIndexPattern);

     // Link following is disabled here:
     //Follow = XpathFollow(".");
 }
Beispiel #4
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for
 /// <c>www.nvshens.com/&lt;word char&gt;/&lt;digits&gt;/&lt;digits&gt;.html</c> URLs.
 /// </summary>
 public NvshensPageDetailDataParser()
 {
     // The dot before "html" is escaped so it matches only a literal '.'; unescaped it
     // would match any character (e.g. ".../12xhtml").
     const string pageDetailPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w\\/\\d*\\/\\d+\\.html$";
     Required = DataParserHelper.CheckIfRequiredByRegex(pageDetailPattern);

     // Link following is disabled here:
     //Follow = XpathFollow(".");
 }
Beispiel #5
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for
 /// <c>http(s)://www.nvshens.com/gallery/&lt;word chars&gt;/</c> URLs.
 /// </summary>
 public NvshensFirstPageTagDataParser()
 {
     // Earlier, broader pattern kept for reference:
     //CanParse = RegexCanParse("^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(((\\w)*\\/$)|(\\w*\\/\\d.html$))");

     // The original pattern had a stray " ?" between the scheme and the colon, which made
     // the regex accept an optional literal space ("http ://..."). The scheme itself was
     // already mandatory and stays so; only the space is removed.
     const string firstTagPagePattern = "^((https|http):\\/\\/)www\\.nvshens\\.com\\/gallery\\/(\\w)*\\/$";
     Required = DataParserHelper.CheckIfRequiredByRegex(firstTagPagePattern);
 }
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for the
 /// pattern <c>news\.cnblogs\.com/n/\d+</c> (a news path ending in digits).
 /// </summary>
 public NewsParser()
 {
     const string newsDetailPattern = "news\\.cnblogs\\.com/n/\\d+";
     Required = DataParserHelper.CheckIfRequiredByRegex(newsDetailPattern);
 }
Beispiel #7
0
 /// <summary>
 /// Creates the parser: <c>Required</c> is a regex-based check for <c>cnblogs\.com</c>, and
 /// <c>FollowRequestQuerier</c> is built from an XPath follow-request query over <c>"."</c>.
 /// </summary>
 public CnblogsDataParser()
 {
     const string sitePattern = "cnblogs\\.com";
     Required = DataParserHelper.CheckIfRequiredByRegex(sitePattern);

     var followQuery = DataParserHelper.QueryFollowRequestsByXPath(".");
     FollowRequestQuerier = BuildFollowRequestQuerier(followQuery);
 }
Beispiel #8
0
 /// <summary>
 /// Creates the parser: <c>Required</c> is a regex-based check for <c>cnblogs\.com</c>, and
 /// <c>GetFollowRequests</c> is an XPath follow-request query over <c>"."</c>.
 /// </summary>
 public CnblogsDataParser()
 {
     const string sitePattern = "cnblogs\\.com";
     const string followXPath = ".";

     Required = DataParserHelper.CheckIfRequiredByRegex(sitePattern);
     GetFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(followXPath);
 }
Beispiel #9
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for
 /// <c>http(s)://www.nvshens.org/gallery/&lt;word chars&gt;/&lt;digits&gt;.html</c> URLs.
 /// </summary>
 public NvshensPageTagDataParser()
 {
     // Two pattern fixes compared with the original literal:
     //  * the stray " ?" between the scheme and the colon (which accepted "http ://...")
     //    is removed; the scheme was already mandatory and stays so;
     //  * the dot before "html" is escaped so it matches only a literal '.'.
     const string tagPagePattern = "^((https|http):\\/\\/)www\\.nvshens\\.org\\/gallery\\/\\w*\\/\\d+\\.html$";
     Required = DataParserHelper.CheckIfRequiredByRegex(tagPagePattern);

     // Link following is disabled here:
     //Follow = XpathFollow(".");
 }
Beispiel #10
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for
 /// <c>www.nvshens.org/&lt;word chars&gt;/&lt;optional digits&gt;/</c> URLs.
 /// </summary>
 public NvshensFirstPageDetailDataParser()
 {
     const string firstDetailPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.org\\/\\w+\\/\\d*\\/$";
     Required = DataParserHelper.CheckIfRequiredByRegex(firstDetailPagePattern);

     // Link following is disabled here:
     //Follow = XpathFollow(".");
 }
Beispiel #11
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check built from two
 /// patterns: a detail page (<c>.../cn/&lt;3-15 digits&gt;.htm</c>) and a paged detail page
 /// (<c>.../cn/&lt;3-50 digits&gt;_&lt;1-3 digits&gt;.htm</c>) under <c>www.umei.cc/p/gaoqing</c>.
 /// </summary>
 public YouMeiDetailSpider()
 {
     // Only detail pages are handled here; other pages are left to other processors.
     // Example detail URL: http://www.umei.cc/p/gaoqing/cn/188495.htm
     //
     // The dots in the host name and before "htm" are escaped so they match only a literal
     // '.'; unescaped they would match any character (e.g. "wwwXumei.cc").
     const string detailPagePattern = "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{3,15}\\.htm$";
     const string pagedDetailPattern = "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{3,50}_\\d{1,3}\\.htm$";

     Required = DataParserHelper.CheckIfRequiredByRegex(detailPagePattern, pagedDetailPattern);

     // Link following is disabled here:
     //Follow = XpathFollow(".");
 }
Beispiel #12
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for
 /// <c>music.163.com/#/playlist?id=&lt;digits&gt;</c> URLs.
 /// </summary>
 public MusicListDataParser()
 {
     // The dots in the host name are escaped so they match only a literal '.'; unescaped
     // they would match any character (e.g. "musicX163Ycom").
     const string playlistPattern = "^(http|https)?://music\\.163\\.com/#/playlist\\?id=[0-9]*";
     Required = DataParserHelper.CheckIfRequiredByRegex(playlistPattern);
 }
Beispiel #13
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for
 /// <c>www.umei.cc/p/gaoqing/cn/&lt;1-2 digits&gt;.htm</c> URLs.
 /// </summary>
 public YouMeiSpider()
 {
     // Only the parent (listing) pages are handled here; other pages are left to other
     // processors.
     //
     // The dots in the host name and before "htm" are escaped so they match only a literal
     // '.'; unescaped they would match any character.
     const string listingPagePattern = "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{1,2}\\.htm$";
     Required = DataParserHelper.CheckIfRequiredByRegex(listingPagePattern);

     // Link following is disabled here:
     //Follow = XpathFollow(".");
 }
Beispiel #14
0
 /// <summary>
 /// Creates the parser, stores <paramref name="start"/> in <c>_start</c>, sets
 /// <c>Required</c> to a regex-based check for <c>/list</c>, and builds
 /// <c>FollowRequestQuerier</c> from an XPath query selecting the last anchor inside
 /// <c>div.ew-page</c>.
 /// </summary>
 /// <param name="start">Value stored in the <c>_start</c> field.</param>
 public V_360KanParser(SpiderStart start)
 {
     const string listPattern = "/list";
     const string lastPagerLinkXPath = "//div[@class='ew-page']/a[last()]";

     _start = start;
     Required = DataParserHelper.CheckIfRequiredByRegex(listPattern);

     var followQuery = DataParserHelper.QueryFollowRequestsByXPath(lastPagerLinkXPath);
     FollowRequestQuerier = BuildFollowRequestQuerier(followQuery);
 }
Beispiel #15
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex-based check for the
 /// pattern <c>/m/</c>.
 /// </summary>
 public V_360KanDetailParser()
 {
     const string detailPattern = "/m/";
     Required = DataParserHelper.CheckIfRequiredByRegex(detailPattern);
 }