Пример #1
0
        /// <summary>
        /// Downloads a proxy-list page and runs a RuiJi expression containing
        /// <c>[tile]</c>, <c>[meta]</c> and <c>[paging]</c> sections against it.
        /// </summary>
        /// <remarks>
        /// Performs a live HTTP request to kuaidaili.com, so it needs network access.
        /// </remarks>
        public void TestPaging()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("https://www.kuaidaili.com/free/inha/1/");

            var response = crawler.Request(request);
            var content  = response.Data.ToString();

            // The expression text is whitespace-sensitive input for the parser; keep it verbatim.
            var s = RuiJiExpression.PaserBlock(@"
[tile]
	css table.table-bordered tr:gt(0):ohtml

	[meta]
	#ip
	css td[data-title='IP']:text

    # port
    css td[data-title='PORT']:text

[paging]
css #listnav a[href]
");

            var result = RuiJiExtracter.Extract(content, s);

            // Previously Assert.IsTrue(true), which could never fail; at least
            // verify that the extractor produced a result object.
            Assert.IsNotNull(result);
        }
Пример #2
0
        /// <summary>
        /// Parses a RuiJi expression that consists of a single <c>[meta]</c> section
        /// and checks that the parsed block contains at least one meta entry.
        /// </summary>
        public void TestBlock()
        {
            // The leading blank lines are part of the expression handed to the parser.
            // (An earlier, larger expression assigned to this variable was overwritten
            // before use and has been removed as a dead store.)
            var block = @"



[meta]
    #time
    css time:text

    #author
    css .author:text

    #title
    css .entry-title:text

    #content
    css .entry-content:html

    #link
    css h4 a[href] -r";

            var m = RuiJiExpression.PaserBlock(block);

            Assert.IsTrue(m.Metas.Count > 0);
        }
Пример #3
0
        /// <summary>
        /// Executes a crawl task: loads the feed, downloads it, extracts the feed
        /// page itself, then extracts every address found on the feed page.
        /// </summary>
        /// <param name="t">Task argument; must be a <c>CrawlTaskModel</c>.</param>
        /// <param name="task">
        /// The owning task. Status messages are sent to <c>task.Progress</c> when it
        /// implements <c>IProgress&lt;string&gt;</c>; otherwise they are discarded.
        /// </param>
        /// <returns>
        /// A <c>List&lt;ExtractResult&gt;</c> holding the feed's own extraction result
        /// followed by the results for each extracted address. When
        /// <c>IncludeContent</c> is false on the model, <c>ClearContent</c> is applied
        /// to every result before returning.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="t"/> is not a <c>CrawlTaskModel</c>.
        /// </exception>
        public object Run(object t, ParallelTask task)
        {
            var model = t as CrawlTaskModel;
            if (model == null)
            {
                // Fail with a descriptive error instead of a NullReferenceException further down.
                throw new ArgumentException("Expected a CrawlTaskModel instance.", "t");
            }

            // A Progress<string> without handlers ignores reports, so a missing or
            // incompatible progress sink no longer crashes the task.
            var reporter = task.Progress as IProgress<string> ?? new Progress<string>();
            var results  = new List<ExtractResult>();

            // Status: reading the feed record.
            reporter.Report("正在读取Feed记录");
            var feed = FeedLiteDb.GetFeed(model.FeedId);

            // Status: downloading the feed.
            reporter.Report("正在下载 Feed");

            var compile = new CompileFeedAddress();
            feed.Address = compile.Compile(feed.Address);

            var job  = new FeedJob();
            var snap = job.DoTask(feed, false);

            // Status: feed download finished.
            reporter.Report("Feed 下载完成");

            var block      = RuiJiExpression.ParserBlock(feed.RuiJiExpression);
            var feedResult = RuiJiExtracter.Extract(snap.Content, block);
            results.Add(feedResult);

            // Status: extracting addresses from the feed.
            reporter.Report("正在提取Feed地址");
            var j    = new FeedExtractJob();
            var urls = j.ExtractAddress(snap);

            // Status: address extraction finished.
            reporter.Report("Feed地址提取完成");

            foreach (var url in urls)
            {
                // Status: extracting the given address.
                reporter.Report("正在提取地址 " + url);
                var r = ContentQueue.Instance.Extract(url);

                results.AddRange(r);
            }

            // Status: all work done.
            reporter.Report("计算完成");

            if (!model.IncludeContent)
            {
                foreach (var item in results)
                {
                    ClearContent(item);
                }
            }

            return results;
        }
Пример #4
0
        /// <summary>
        /// Feeds a meta expression that uses each selector keyword (css, ex, exp,
        /// reg, regR, regS, text, xpath, jpath) to the parser and checks that at
        /// least one meta entry comes back.
        /// </summary>
        public void TestMeta()
        {
            // Verbatim literal: indentation and trailing whitespace are parser input.
            var expression = @"
                #css
                css h4 a[href] -r
                css h4:ohtml
                css h4:html -r
                css h4 a:text

                #exclude
                ex /:/ -b
                ex /\-e/ -e
                ex /\-/ -a
                ex /[\d]*/

                #expression
                exp ????/??/?? ??:??:??* 
                exp datetime_?? -r

                #regex
                reg /[\d]*/
                reg /aa([\d]*)sf/ 0 1
                reg /aa([\d]*)sf/ -r

                #regexReplace
                regR /aaaa/ dddd/
                
                #regexSplit
                regS /aaa/ 2 3 5
                regS /aaa/ 2 3 5 -r

                #textRange
                text /a\naa/ /b\tbb/
                text /aaa/ /bbb/ -r

                #xpath
                xpath ladkfeio
                xpath dlqwekrjl -r

                #jsonPath
                jpath dlsldf.kljs
                jpath dlkejl -r
                ";

            var parsed = RuiJiExpression.PaserMeta(expression);

            Assert.IsTrue(parsed.Count > 0);
        }
Пример #5
0
        /// <summary>
        /// Downloads a page and extracts it with a block whose tile selectors come
        /// from the expression <c>css a[href]</c>.
        /// </summary>
        /// <remarks>
        /// Performs a live HTTP request to ruijihg.com, so it needs network access.
        /// </remarks>
        public void TestExtract()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");

            var response = crawler.Request(request);
            var content  = response.Data.ToString();

            var block = new ExtractBlock();
            var s     = RuiJiExpression.ParserBase("css a[href]").Selectors;

            block.TileSelector.Selectors.AddRange(s);
            var result = RuiJiExtracter.Extract(content, block);

            // Previously Assert.IsTrue(true), which could never fail; at least
            // verify that the extractor produced a result object.
            Assert.IsNotNull(result);
        }
Пример #6
0
        /// <summary>
        /// Extracts the distinct link addresses from a downloaded feed snapshot.
        /// </summary>
        /// <param name="feed">
        /// Snapshot supplying the content to scan, the URL used to resolve relative
        /// links, and optionally a custom extraction rule (a JSON block when
        /// <c>UseBlock</c> is set, otherwise a RuiJi expression).
        /// </param>
        /// <returns>
        /// Distinct addresses with any <c>#fragment</c> removed. Relative links are
        /// resolved against <c>feed.Url</c>. Tiles with no href, or with only a
        /// fragment, are skipped. The list is empty when no tiles were extracted.
        /// </returns>
        public List<string> ExtractAddress(FeedSnapshot feed)
        {
            // Default rule: the href of every anchor.
            var block = new ExtractBlock();
            block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

            if (feed.UseBlock)
            {
                if (!string.IsNullOrEmpty(feed.BlockExpression))
                {
                    block = JsonConvert.DeserializeObject<ExtractBlock>(feed.BlockExpression);
                }
            }
            else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
            {
                // A custom expression replaces the default anchor selector.
                block.TileSelector.Selectors.Clear();
                var s = RuiJiExpression.ParserBase(feed.RuiJiExpression).Selectors;
                block.TileSelector.Selectors.AddRange(s);
            }

            var result  = RuiJiExtracter.Extract(feed.Content, block);
            var results = new List<string>();

            if (result.Tiles == null)
            {
                return results;
            }

            // Built on first use so an unusable feed.Url only matters when a
            // relative link actually has to be resolved.
            Uri baseUri = null;

            foreach (var item in result.Tiles)
            {
                var href = item.Content;
                if (string.IsNullOrEmpty(href))
                {
                    // No href captured for this tile; nothing to add.
                    continue;
                }

                var fragmentStart = href.IndexOf('#');
                if (fragmentStart >= 0)
                {
                    href = href.Substring(0, fragmentStart);
                }

                if (href.Length == 0)
                {
                    // Bare fragment such as "#top": it does not point at another page.
                    continue;
                }

                if (Uri.IsWellFormedUriString(href, UriKind.Relative))
                {
                    if (baseUri == null)
                    {
                        baseUri = new Uri(feed.Url);
                    }

                    href = new Uri(baseUri, href).AbsoluteUri;
                }

                results.Add(href);
            }

            return results.Distinct().ToList();
        }
Пример #7
0
        /// <summary>
        /// Returns the extraction blocks of all rules matching <paramref name="url"/>,
        /// but only when the node serving this request is a feed proxy.
        /// </summary>
        /// <param name="url">Address to match against the stored rules.</param>
        /// <param name="useBlock">
        /// When true, each block is deserialized from the rule's JSON
        /// <c>BlockExpression</c>; otherwise it is parsed from its <c>RuiJiExpression</c>.
        /// </param>
        /// <returns>
        /// A list of blocks for a feed-proxy node; an empty anonymous object for any
        /// other node type.
        /// </returns>
        public object UrlRule(string url, bool useBlock = false)
        {
            var node = ServerManager.Get(Request.RequestUri.Authority);

            // Only feed-proxy nodes serve rules.
            if (node.NodeType != Node.NodeTypeEnum.FEEDPROXY)
            {
                return new { };
            }

            var rules = RuleLiteDB.Match(url);

            if (useBlock)
            {
                return rules
                    .Select(rule => JsonConvert.DeserializeObject<ExtractBlock>(rule.BlockExpression))
                    .ToList();
            }

            return rules
                .Select(rule => RuiJiExpression.ParserBlock(rule.RuiJiExpression))
                .ToList();
        }
Пример #8
0
        /// <summary>
        /// Compiles a templated feed address, downloads the JSONP response, and
        /// extracts it with a regex selector followed by a JSON-path selector,
        /// expecting non-empty extracted content.
        /// </summary>
        /// <remarks>
        /// Performs a live HTTP request to app.cannews.com.cn, so it needs network access.
        /// </remarks>
        public void TestJsonPExtract()
        {
            // The {# ... #} placeholders are expanded by CompileFeedAddress.Compile.
            var address = new CompileFeedAddress().Compile(
                "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1");

            var response = new RuiJiCrawler().Request(new Request(address));

            var parsedBlock = RuiJiExpression.ParserBlock(@"
reg /jsonp[\d]+?\((.*)\)/ 1
jpath $..url
");
            var body      = response.Data.ToString();
            var extracted = RuiJiExtracter.Extract(body, parsedBlock);

            Assert.IsTrue(extracted.Content.Length > 0);
        }