/// <summary>
/// Sets up this spider: registers a MySQL pipeline, queues the Baidu news
/// search URL for the keyword, and attaches a default page processor.
/// </summary>
/// <param name="arguments">Initialization arguments; not read by this method.</param>
protected override void MyInit(params string[] arguments)
        {
            var word = "可乐|雪碧";

            AddPipeline(new DefaultMySqlPipeline(Core.Environment.DataConnectionString, "baidu", "mysql_baidu_search"));

            // Percent-encode the keyword before embedding it in the query string:
            // it contains non-ASCII characters and '|', neither of which is valid
            // in a raw URL query component.
            var startUrl = string.Format(
                "http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1",
                System.Uri.EscapeDataString(word));

            // The extra data deliberately carries the raw (unencoded) keyword.
            AddStartUrl(startUrl, new Dictionary<string, dynamic> {
                { "Keyword", word }
            });

            var processor = new DefaultPageProcessor();

            // NOTE(review): presumably the first argument is the XPath of the region
            // to scan for links and the second a URL pattern to accept — confirm
            // against AddTargetUrlExtractor's definition.
            processor.AddTargetUrlExtractor("//p[@id=\"page\"]", "&pn=[0-9]+&");
            AddPageProcessor(processor);
        }
Пример #2
0
        /// <summary>
        /// Checks that a spider built from a default site and page processor
        /// exposes non-null members and reports the <c>Init</c> status.
        /// </summary>
        public void SpiderTest_Success()
        {
            var site      = new Site();
            var processor = new DefaultPageProcessor();
            var spider    = new Spider(site, processor);

            Assert.IsNotNull(spider);
            Assert.IsNotNull(spider.Scheduler);
            Assert.IsNotNull(spider.DownLoader);
            Assert.IsNotNull(spider.Site);
            Assert.IsNotNull(spider.PageProcessor);

            // AreEqual / IsNotNull report the offending value on failure,
            // unlike IsTrue wrapped around a comparison.
            Assert.AreEqual(SpiderStatusEnum.Init, spider.Status);
            Assert.IsNotNull(spider.Pipelines);
            Assert.IsNotNull(spider.SpiderListening);
        }
        /// <summary>
        /// Sets up this spider: registers a MySQL pipeline, queues the Baidu news
        /// search request for the keyword, and attaches a default page processor
        /// configured with an XPath request extractor and a pattern filter.
        /// </summary>
        /// <param name="arguments">Initialization arguments; not read by this method.</param>
        protected override void OnInit(params string[] arguments)
        {
            var word = "可乐|雪碧";

            AddPipeline(new DefaultMySqlPipeline(Env.DataConnectionString, "baidu", "mysql_baidu_search"));

            // Percent-encode the keyword before embedding it in the query string:
            // it contains non-ASCII characters and '|', neither of which is valid
            // in a raw URL query component.
            var searchUrl = string.Format(
                "http://news.baidu.com/ns?word={0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1",
                System.Uri.EscapeDataString(word));

            // The extra data deliberately carries the raw (unencoded) keyword.
            AddRequest(searchUrl, new Dictionary<string, dynamic> {
                { "Keyword", word }
            });

            var processor = new DefaultPageProcessor();

            // NOTE(review): presumably the extractor limits link discovery to the
            // element matched by the XPath and the filter keeps only URLs matching
            // the pattern — confirm against the extractor/filter definitions.
            processor.RequestExtractor = new XPathRequestExtractor("//p[@id=\"page\"]");
            processor.Filter           = new PatternFilter(new[] { "&pn=[0-9]+&" });
            AddPageProcessors(processor);
        }
Пример #4
0
        /// <summary>
        /// Builds a spider over a site whose <c>ThreadCount</c> is 30, adds one seed
        /// URL, two pipelines and a console listener, then runs it.
        /// Makes no assertions; it passes as long as nothing throws.
        /// </summary>
        public void RunTest_Success()
        {
            var crawlSite       = new Site { ThreadCount = 30 };
            var processor       = new DefaultPageProcessor();
            var htmlOutput      = new HtmlFilePiepline();
            var fileOutput      = new FilePipeline();
            var consoleListener = new ConsoleSpiderListening();
            var crawler         = new Spider(crawlSite, processor);

            // Keep the fluent chain intact: each call's return value feeds the next.
            var configured = crawler
                .AddSeedUrl("http://sh.lianjia.com/")
                .AddPiepline(htmlOutput)
                .AddPiepline(fileOutput)
                .AddListening(consoleListener);

            configured.Run();
        }