/// <summary>
/// Builds a spider through <c>SpiderBuilder</c>, configures it to crawl
/// cnblogs.com starting from the site root, and starts it.
/// </summary>
/// <remarks>
/// NOTE(review): the value returned by <c>RunAsync()</c> is discarded. If it is a
/// task, this method returns before the crawl has finished - confirm that
/// callers keep the process alive.
/// </remarks>
public static void Run1()
        {
            // Builder setup: each call below mutates the same builder instance.
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();
            spiderBuilder.AddSpider<EntitySpider>();

            var services = spiderBuilder.Build();
            var crawler  = services.Create<Spider>();

            // Task identifier: a fresh GUID in "N" format (32 hex digits, no dashes).
            var taskId = Guid.NewGuid().ToString("N");
            crawler.Id = taskId;

            // Task name.
            crawler.Name = "博客园全站采集";

            // Crawl speed: how many requests are downloaded per second. Above 1,
            // larger is faster; below 1, smaller is slower; must not be 0.
            crawler.Speed = 1;

            // Crawl depth.
            crawler.Depth = 3;

            // Plain downloader: a clean HttpClient that does not deal with cookies.
            crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

            // Parser stage: handles cnblogs.com URLs as HTML and queries follow
            // requests with the XPath ".".
            var linkParser = new DataParser();
            linkParser.SelectableFactory   = ctx => ctx.GetSelectable(ContentType.Html);
            linkParser.CanParse            = DataParserHelper.CanParseByRegex("cnblogs\\.com");
            linkParser.QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".");

            // Parser first, then a console storage that prints the crawl results.
            crawler.AddDataFlow(linkParser).AddDataFlow(new ConsoleStorage());

            // Start URL.
            crawler.AddRequests("http://www.cnblogs.com/");

            // Start the spider; the result is intentionally left as in the original (not awaited).
            crawler.RunAsync();
        }
Exemple #2
0
        /// <summary>
        /// Builds a spider through <c>SpiderHostBuilder</c> (logging, JSON
        /// configuration and local services), configures it to crawl cnblogs.com
        /// starting from the site root, and starts it.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the value returned by <c>RunAsync()</c> is discarded. If it is a
        /// task, this method returns before the crawl has finished - confirm that
        /// callers keep the process alive.
        /// </remarks>
        public static void Run1()
        {
            // Stage 1: logging through Serilog.
            var loggingStage = new SpiderHostBuilder()
                .ConfigureLogging(logging => logging.AddSerilog());

            // Stage 2: application configuration read from appsettings.json.
            var configStage = loggingStage
                .ConfigureAppConfiguration(config => config.AddJsonFile("appsettings.json"));

            // Stage 3: service registrations, all of the "local" flavour.
            var hostBuilder = configStage.ConfigureServices(registry =>
            {
                registry.AddLocalMessageQueue();
                registry.AddLocalDownloaderAgent(agent =>
                {
                    agent.UseFileLocker();
                    agent.UseDefaultAdslRedialer();
                    agent.UseDefaultInternetDetector();
                });
                registry.AddLocalDownloadCenter();
                registry.AddSpiderStatisticsCenter(stats => stats.UseMemory());
            });

            var host    = hostBuilder.Build();
            var crawler = host.Create<Spider>();

            // Task identifier: a fresh GUID in "N" format (32 hex digits, no dashes).
            var taskId = Guid.NewGuid().ToString("N");
            crawler.Id = taskId;

            // Task name.
            crawler.Name = "博客园全站采集";

            // Crawl speed: how many requests are downloaded per second. Above 1,
            // larger is faster; below 1, smaller is slower; must not be 0.
            crawler.Speed = 1;

            // Crawl depth.
            crawler.Depth = 3;

            // Plain downloader: a clean HttpClient that does not deal with cookies.
            crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

            // Parser stage: handles cnblogs.com URLs as HTML and queries follow
            // requests with the XPath ".".
            var linkParser = new DataParser();
            linkParser.SelectableFactory   = ctx => ctx.GetSelectable(ContentType.Html);
            linkParser.CanParse            = DataParserHelper.CanParseByRegex("cnblogs\\.com");
            linkParser.QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".");

            // Parser first, then a console storage that prints the crawl results.
            crawler.AddDataFlow(linkParser).AddDataFlow(new ConsoleStorage());

            // Start URL.
            crawler.AddRequests("http://www.cnblogs.com/");

            // Start the spider; the result is intentionally left as in the original (not awaited).
            crawler.RunAsync();
        }
Exemple #3
0
 /// <summary>
 /// Sets up the parser: <c>CanParse</c> is built from a regex matching
 /// "cnblogs.com", and <c>QueryFollowRequests</c> from the XPath ".".
 /// </summary>
 public CnblogsDataParser()
 {
     const string urlPattern  = "cnblogs\\.com"; // literal "cnblogs.com" anywhere in the input
     const string followXPath = ".";             // XPath for the current node

     CanParse            = DataParserHelper.CanParseByRegex(urlPattern);
     QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(followXPath);
 }
Exemple #4
0
 /// <summary>
 /// Sets <c>RequireParse</c> to a regex check for the gallery index URL
 /// (<c>www.nvshens.com/gallery/</c>, anchored at both ends).
 /// No follow-request query is assigned in this constructor.
 /// </summary>
 public NvshensTagIndexDataParser()
 {
     const string galleryIndexPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/$";

     RequireParse = DataParserHelper.CanParseByRegex(galleryIndexPattern);
 }
Exemple #5
0
 /// <summary>
 /// Sets <c>RequireParse</c> to a regex check for paged detail URLs of the form
 /// <c>www.nvshens.com/{one word char}/{digits}/{digits}.html</c>, anchored at
 /// both ends. No follow-request query is assigned in this constructor.
 /// </summary>
 public NvshensPageDetailDataParser()
 {
     // The dot before "html" is escaped so that only a literal '.' matches.
     // Unescaped, it matched any character (e.g. ".../12Xhtml" or ".../123html").
     RequireParse = DataParserHelper.CanParseByRegex("^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w\\/\\d*\\/\\d+\\.html$");
 }
Exemple #6
0
 /// <summary>
 /// Sets <c>RequireParse</c> to a regex check for tag gallery URLs that end in
 /// <c>/gallery/{word chars}/</c>. URLs ending in a paged <c>.html</c> segment
 /// are not matched by this pattern.
 /// </summary>
 public NvshensFirstPageTagDataParser()
 {
     const string firstTagPagePattern = "^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(\\w)*\\/$";

     RequireParse = DataParserHelper.CanParseByRegex(firstTagPagePattern);
 }
 /// <summary>
 /// Sets <c>CanParse</c> to a regex check for detail URLs of the form
 /// <c>www.nvshens.com/{word chars}/{digits}/</c>, anchored at both ends.
 /// No follow-request query is assigned in this constructor.
 /// </summary>
 public NvshensFirstPageDetailDataParser()
 {
     const string firstDetailPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w+\\/\\d*\\/$";

     CanParse = DataParserHelper.CanParseByRegex(firstDetailPagePattern);
 }
 /// <summary>
 /// Sets <c>CanParse</c> to a regex check for paged tag gallery URLs of the form
 /// <c>www.nvshens.com/gallery/{word chars}/{digits}.html</c>, anchored at both
 /// ends. No follow-request query is assigned in this constructor.
 /// </summary>
 public NvshensPageTagDataParser()
 {
     // The dot before "html" is escaped so that only a literal '.' matches.
     // Unescaped, it matched any character (e.g. ".../2Xhtml" or ".../23html").
     CanParse = DataParserHelper.CanParseByRegex("^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/\\w*\\/\\d+\\.html$");
 }
 /// <summary>
 /// Sets up the parser: <c>CanParse</c> is built from a regex matching
 /// "cnblogs.com", and <c>QueryFollowRequests</c> from an XPath that selects
 /// anchors inside <c>div.pager</c> whose text contains "Next".
 /// </summary>
 public CnblogsDataParser()
 {
     const string urlPattern     = "cnblogs\\.com";
     const string nextPageXPath  = "//div[@class='pager']/a[contains(text(),'Next')]";

     this.CanParse            = DataParserHelper.CanParseByRegex(urlPattern);
     this.QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(nextPageXPath);
 }