/// <summary>
        /// Configures a <c>SpiderBuilder</c>, creates a <c>Spider</c> from the built provider,
        /// attaches a <c>DataParser</c> followed by a <c>ConsoleStorage</c> data flow, seeds the
        /// start URL and starts the crawl.
        /// </summary>
        /// <remarks>
        /// The value returned by <c>RunAsync</c> is discarded, so this method does not wait for
        /// the crawl itself (presumably a Task is returned - confirm against the Spider API).
        /// </remarks>
        public static void Run1()
        {
            var hostBuilder = new SpiderBuilder();
            hostBuilder.AddSerilog();
            hostBuilder.ConfigureAppConfiguration();
            hostBuilder.UseStandalone();
            hostBuilder.AddSpider <EntitySpider>();

            var crawler = hostBuilder.Build().Create <Spider>();

            // Task identifier.
            crawler.Id = Guid.NewGuid().ToString("N");
            // Task name.
            crawler.Name = "博客园全站采集";
            // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
            // below 1, smaller is slower. Must not be 0.
            crawler.Speed = 1;
            // Crawl depth.
            crawler.Depth = 3;
            // Use the plain downloader: a clean HttpClient that does not deal with cookies.
            crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

            var pageParser = new DataParser();
            pageParser.SelectableFactory   = ctx => ctx.GetSelectable(ContentType.Html);
            pageParser.CanParse            = DataParserHelper.CanParseByRegex("cnblogs\\.com");
            pageParser.QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".");

            // The second flow prints the crawl results to the console.
            crawler.AddDataFlow(pageParser)
                   .AddDataFlow(new ConsoleStorage());

            // Start URL.
            crawler.AddRequests("http://www.cnblogs.com/");

            // Start the crawl.
            crawler.RunAsync();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Checks that the follow-request query built from the XPath
        /// <c>.//div[@class='pager']</c> returns 12 requests for the local
        /// <c>cnblogs.html</c> fixture, including the link to page 2.
        /// </summary>
        public void XpathFollow()
        {
            using (var hostBuilder = GetLocalSpiderHostBuilder())
            {
                var scopedServices = hostBuilder.Build().CreateScopeServiceProvider();

                // Response assembled from the fixture file on disk.
                var response = new Response
                {
                    Request = new Request("http://cnblogs.com"),
                    Content = File.ReadAllBytes("cnblogs.html"),
                    CharSet = "UTF-8"
                };
                var flowContext = new DataFlowContext(response, scopedServices);

                // Populate the selectable from the response when the context has none yet.
                if (flowContext.Selectable == null)
                {
                    flowContext.Selectable = flowContext.Response?.ToSelectable();
                }

                var followQuery    = DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']");
                var followRequests = followQuery.Invoke(flowContext);

                Assert.Equal(12, followRequests.Count);
                Assert.Contains(followRequests, url => url == "http://cnblogs.com/sitehome/p/2");
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check built from the
 /// pattern <c>news\.cnblogs\.com/n/page</c>.
 /// </summary>
 public ListNewsParser()
 {
     const string listPagePattern = "news\\.cnblogs\\.com/n/page";

     Required = DataParserHelper.CheckIfRequiredByRegex(listPagePattern);

     // Uncomment the lines below if you also want to follow pagination links.
     //FollowRequestQuerier =
     //	BuildFollowRequestQuerier(DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']"));
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds a <c>SpiderHostBuilder</c> with Serilog logging, <c>appsettings.json</c>
        /// configuration and the local event bus, download center, downloader agent and
        /// in-memory statistics services, then creates a <c>Spider</c>, attaches a
        /// <c>DataParser</c> and <c>ConsoleStorage</c> data flow, seeds the start URL and
        /// starts the crawl.
        /// </summary>
        /// <remarks>
        /// The value returned by <c>RunAsync</c> is discarded, so this method does not wait for
        /// the crawl itself (presumably a Task is returned - confirm against the Spider API).
        /// </remarks>
        public static void Run1()
        {
            var hostBuilder = new SpiderHostBuilder()
                .ConfigureLogging(logging => logging.AddSerilog())
                .ConfigureAppConfiguration(config => config.AddJsonFile("appsettings.json"))
                .ConfigureServices(services =>
                {
                    services.AddLocalEventBus();
                    services.AddLocalDownloadCenter();
                    services.AddDownloaderAgent(agent =>
                    {
                        agent.UseFileLocker();
                        agent.UseDefaultAdslRedialer();
                        agent.UseDefaultInternetDetector();
                    });
                    services.AddStatisticsCenter(statistics => statistics.UseMemory());
                });

            var crawler = hostBuilder.Build().Create <Spider>();

            // Task identifier.
            crawler.Id = Guid.NewGuid().ToString("N");
            // Task name.
            crawler.Name = "博客园全站采集";
            // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
            // below 1, smaller is slower. Must not be 0.
            crawler.Speed = 1;
            // Crawl depth.
            crawler.Depth = 3;

            var pageParser = new DataParser();
            pageParser.SelectableFactory = ctx => ctx.GetSelectable(ContentType.Html);
            pageParser.Required          = DataParserHelper.CheckIfRequiredByRegex("cnblogs\\.com");
            pageParser.GetFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".");

            // The second flow prints the crawl results to the console.
            crawler.AddDataFlow(pageParser)
                   .AddDataFlow(new ConsoleStorage());

            // Start URL.
            crawler.AddRequests("http://www.cnblogs.com/");

            // Start the crawl.
            crawler.RunAsync();
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Creates the parser, optionally configuring follow-request XPath selectors, and
 /// stores the item mapping.
 /// </summary>
 /// <param name="itemFollowSelectors">
 /// Semicolon-separated XPath selectors; when null, blank, or made up only of empty
 /// entries, <c>QueryFollowRequests</c> is left untouched.
 /// </param>
 /// <param name="mapping">Item mapping kept in <c>_mapping</c>.</param>
 public SimpleItemDataParser(string itemFollowSelectors, ItemMapping mapping)
 {
     //	CanParse = DataParserHelper.CanParseByRegex("vnexpress\\.com");

     // Example selectors:
     //new string[] { "//article/h1[@class='title_news']/a[1]", "//article[@class='list_news']/h4[@class='title_news']/a[1]" }
     var followXPaths = string.IsNullOrWhiteSpace(itemFollowSelectors)
         ? null
         : itemFollowSelectors.Split(';', StringSplitOptions.RemoveEmptyEntries);

     // A null array (blank input) or an empty one both skip the assignment.
     if (followXPaths?.Length > 0)
     {
         QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(followXPaths);
     }

     _mapping = mapping;
 }
Ejemplo n.º 6
0
        /// <summary>
        /// Checks that the follow-request query built from the XPath
        /// <c>.//div[@class='pager']</c> returns 12 requests for the local
        /// <c>cnblogs.html</c> fixture, including the link to page 2.
        /// </summary>
        public void XpathFollow()
        {
            var scopedServices = SpiderProvider.Value.CreateScopeServiceProvider();

            // Response assembled from the fixture file on disk.
            var response = new Response
            {
                Request = new Request("http://cnblogs.com"),
                RawText = File.ReadAllText("cnblogs.html")
            };
            var flowContext = new DataFlowContext(response, scopedServices);

            var followQuery    = DataParserHelper.QueryFollowRequestsByXPath(".//div[@class='pager']");
            var followRequests = followQuery.Invoke(flowContext);

            Assert.Equal(12, followRequests.Count);
            Assert.Contains(followRequests, url => url == "http://cnblogs.com/sitehome/p/2");
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Creates the parser and sets <c>RequireParse</c> to a regex check for
 /// <c>www.nvshens.com/&lt;word&gt;/&lt;digits&gt;/</c> style URLs.
 /// </summary>
 public NvshensFirstPageDetailDataParser()
 {
     const string firstDetailPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w+\\/\\d*\\/$";

     RequireParse = DataParserHelper.CanParseByRegex(firstDetailPagePattern);
     //Follow = XpathFollow(".");
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Creates the parser and sets <c>RequireParse</c> to a regex check for
 /// <c>www.nvshens.com/gallery/&lt;word&gt;/&lt;digits&gt;.html</c> URLs.
 /// </summary>
 public NvshensPageTagDataParser()
 {
     // The dot before "html" is escaped so it matches a literal '.' rather than any
     // character, and the stray optional space before the scheme colon is removed.
     RequireParse = DataParserHelper.CanParseByRegex("^((https|http):\\/\\/)www\\.nvshens\\.com\\/gallery\\/\\w*\\/\\d+\\.html$");
     //Follow = XpathFollow(".");
 }
Ejemplo n.º 9
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check built from the
 /// pattern <c>/m/</c>.
 /// </summary>
 public V_360KanDetailParser()
 {
     const string detailPagePattern = "/m/";

     Required = DataParserHelper.CheckIfRequiredByRegex(detailPagePattern);
 }
Ejemplo n.º 10
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check for the
 /// <c>www.nvshens.com/gallery/</c> index URL.
 /// </summary>
 public NvshensTagIndexDataParser()
 {
     const string galleryIndexPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/$";

     Required = DataParserHelper.CheckIfRequiredByRegex(galleryIndexPattern);
     //Follow = XpathFollow(".");
 }
Ejemplo n.º 11
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check for
 /// <c>www.umei.cc/p/gaoqing/cn/&lt;1-2 digits&gt;.htm</c> URLs.
 /// </summary>
 public YouMeiSpider()
 {
     // Only parent (listing) pages are handled here; other pages are left to other parser types.
     // The dots in the host name and before "htm" are escaped so they match a literal '.'
     // rather than any character.
     Required = DataParserHelper.CheckIfRequiredByRegex("^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{1,2}\\.htm$");
     //Follow = XpathFollow(".");
 }
Ejemplo n.º 12
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check for
 /// <c>www.nvshens.org/&lt;word&gt;/&lt;digits&gt;/</c> style URLs.
 /// </summary>
 public NvshensFirstPageDetailDataParser()
 {
     const string firstDetailPagePattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.org\\/\\w+\\/\\d*\\/$";

     Required = DataParserHelper.CheckIfRequiredByRegex(firstDetailPagePattern);
     //Follow = XpathFollow(".");
 }
Ejemplo n.º 13
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check for
 /// <c>www.nvshens.org/gallery/&lt;word&gt;/&lt;digits&gt;.html</c> URLs.
 /// </summary>
 public NvshensPageTagDataParser()
 {
     // The dot before "html" is escaped so it matches a literal '.' rather than any
     // character, and the stray optional space before the scheme colon is removed.
     Required = DataParserHelper.CheckIfRequiredByRegex("^((https|http):\\/\\/)www\\.nvshens\\.org\\/gallery\\/\\w*\\/\\d+\\.html$");
     //Follow = XpathFollow(".");
 }
Ejemplo n.º 14
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check that accepts two URL
 /// shapes under <c>www.umei.cc/p/gaoqing/cn/</c>: <c>&lt;digits&gt;.htm</c> and
 /// <c>&lt;digits&gt;_&lt;digits&gt;.htm</c>.
 /// </summary>
 public YouMeiDetailSpider()
 {
     // Only detail pages are handled here; other pages are left to other parser types.
     // Example: http://www.umei.cc/p/gaoqing/cn/188495.htm
     // The dots in the host name and before "htm" are escaped so they match a literal '.'
     // rather than any character.
     Required = DataParserHelper.CheckIfRequiredByRegex(
         "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{3,15}\\.htm$",
         "^((https|http)?:\\/\\/)www\\.umei\\.cc\\/p\\/gaoqing\\/cn\\/\\d{3,50}_\\d{1,3}\\.htm$");
     //Follow = XpathFollow(".");
 }
Ejemplo n.º 15
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check for
 /// <c>music.163.com/#/playlist?id=&lt;digits&gt;</c> URLs.
 /// </summary>
 public MusicListDataParser()
 {
     // The dots in the host name are escaped so they match a literal '.' rather than
     // any character.
     Required = DataParserHelper.CheckIfRequiredByRegex("^(http|https)?://music\\.163\\.com/#/playlist\\?id=[0-9]*");
 }
Ejemplo n.º 16
0
 /// <summary>
 /// Creates the parser: <c>Required</c> is a regex check for <c>cnblogs\.com</c>, and
 /// <c>GetFollowRequests</c> is the follow-request query for the XPath <c>.</c>.
 /// </summary>
 public CnblogsDataParser()
 {
     const string sitePattern = "cnblogs\\.com";
     const string followXPath = ".";

     Required = DataParserHelper.CheckIfRequiredByRegex(sitePattern);

     var followQuery = DataParserHelper.QueryFollowRequestsByXPath(followXPath);
     GetFollowRequests = followQuery;
 }
Ejemplo n.º 17
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check for
 /// <c>www.nvshens.com/gallery/&lt;word&gt;/</c> URLs.
 /// </summary>
 public NvshensFirstPageTagDataParser()
 {
     // Earlier, broader pattern kept for reference:
     //CanParse = RegexCanParse("^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(((\\w)*\\/$)|(\\w*\\/\\d.html$))");
     const string firstTagPagePattern = "^((https|http) ?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/(\\w)*\\/$";

     Required = DataParserHelper.CheckIfRequiredByRegex(firstTagPagePattern);
 }
Ejemplo n.º 18
0
 /// <summary>
 /// Creates the parser: <c>Required</c> is a regex check for <c>cnblogs\.com</c>, and
 /// <c>FollowRequestQuerier</c> is built from the follow-request query for the XPath <c>.</c>.
 /// </summary>
 public CnblogsDataParser()
 {
     const string sitePattern = "cnblogs\\.com";
     const string followXPath = ".";

     Required = DataParserHelper.CheckIfRequiredByRegex(sitePattern);

     var followQuery = DataParserHelper.QueryFollowRequestsByXPath(followXPath);
     FollowRequestQuerier = BuildFollowRequestQuerier(followQuery);
 }
Ejemplo n.º 19
0
 /// <summary>
 /// Creates the parser: stores <paramref name="start"/>, sets <c>Required</c> to a regex
 /// check for <c>/list</c>, and builds <c>FollowRequestQuerier</c> from the last anchor
 /// inside the <c>ew-page</c> div.
 /// </summary>
 /// <param name="start">Value kept in the <c>_start</c> field.</param>
 public V_360KanParser(SpiderStart start)
 {
     const string listPagePattern = "/list";
     const string lastPagerLinkXPath = "//div[@class='ew-page']/a[last()]";

     _start   = start;
     Required = DataParserHelper.CheckIfRequiredByRegex(listPagePattern);

     var followQuery = DataParserHelper.QueryFollowRequestsByXPath(lastPagerLinkXPath);
     FollowRequestQuerier = BuildFollowRequestQuerier(followQuery);
 }
Ejemplo n.º 20
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check for
 /// <c>news.cnblogs.com/n/&lt;digits&gt;</c> URLs.
 /// </summary>
 public NewsParser()
 {
     const string newsDetailPattern = "news\\.cnblogs\\.com/n/\\d+";

     Required = DataParserHelper.CheckIfRequiredByRegex(newsDetailPattern);
 }
Ejemplo n.º 21
0
 /// <summary>
 /// Creates the parser and sets <c>QueryFollowRequests</c> to the follow-request query
 /// for the XPath <c>//li[@class='next']</c>.
 /// </summary>
 public QuotesDataParser()
 {
     //CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com");
     const string nextLinkXPath = "//li[@class='next']";

     var followQuery = DataParserHelper.QueryFollowRequestsByXPath(nextLinkXPath);
     QueryFollowRequests = followQuery;
 }
Ejemplo n.º 22
0
 /// <summary>
 /// Creates the parser and sets <c>Required</c> to a regex check for
 /// <c>www.nvshens.com/&lt;char&gt;/&lt;digits&gt;/&lt;digits&gt;.html</c> URLs.
 /// </summary>
 public NvshensPageDetailDataParser()
 {
     // The dot before "html" is escaped so it matches a literal '.' rather than any character.
     Required = DataParserHelper.CheckIfRequiredByRegex("^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/\\w\\/\\d*\\/\\d+\\.html$");
     //Follow = XpathFollow(".");
 }
Ejemplo n.º 23
0
 /// <summary>
 /// Creates the parser: <c>CanParse</c> is a regex check for <c>cnblogs\.com</c>, and
 /// <c>QueryFollowRequests</c> is the follow-request query for pager anchors whose text
 /// contains "Next".
 /// </summary>
 public CnblogsDataParser()
 {
     const string sitePattern = "cnblogs\\.com";
     const string nextLinkXPath = "//div[@class='pager']/a[contains(text(),'Next')]";

     CanParse = DataParserHelper.CanParseByRegex(sitePattern);

     var followQuery = DataParserHelper.QueryFollowRequestsByXPath(nextLinkXPath);
     QueryFollowRequests = followQuery;
 }
Ejemplo n.º 24
0
 /// <summary>
 /// Creates the parser: <c>CanParse</c> is a regex check for <c>cnblogs\.com</c>, and
 /// <c>QueryFollowRequests</c> is the follow-request query for the XPath <c>.</c>.
 /// </summary>
 public CnblogsDataParser()
 {
     const string sitePattern = "cnblogs\\.com";
     const string followXPath = ".";

     CanParse = DataParserHelper.CanParseByRegex(sitePattern);

     var followQuery = DataParserHelper.QueryFollowRequestsByXPath(followXPath);
     QueryFollowRequests = followQuery;
 }
 /// <summary>
 /// Creates the parser and sets <c>CanParse</c> to a regex check for the
 /// <c>www.nvshens.com/gallery/</c> index URL.
 /// </summary>
 public NvshensTagIndexDataParser()
 {
     const string galleryIndexPattern = "^((https|http)?:\\/\\/)www\\.nvshens\\.com\\/gallery\\/$";

     CanParse = DataParserHelper.CanParseByRegex(galleryIndexPattern);
     //Follow = XpathFollow(".");
 }