Exemplo n.º 1
0
        /// <summary>
        /// Requests an article page, extracts it with a RuiJi expression that declares
        /// [meta] and [paging] sections, and hands the result to
        /// <c>PagingExtractor.MergeContent</c> when paging links and a "content" meta exist.
        /// The closing assertion is unconditional, so only a thrown exception fails the test.
        /// </summary>
        public void TestPaging2()
        {
            var expression = @"
[meta]
	#title
	css h1.a-title

	#date_dt
	css .time:text

	#content
	css .a-con:ohtml

[paging]
css .a-page
css a[href]";

            var crawler     = new RuiJiCrawler();
            var pageRequest = new Request("https://3w.huanqiu.com/a/4e2d56fd7f51/7DHitRASkPC?p=1&agt=8");
            var html        = crawler.Request(pageRequest).Data.ToString();

            var parsedBlock = RuiJiBlockParser.ParserBlock(expression);
            var extracted   = RuiJiExtractor.Extract(html, parsedBlock);

            // Evaluated in the same short-circuit order as a single && chain:
            // Metas is only inspected once paging links are known to exist.
            var hasPaging      = extracted.Paging != null && extracted.Paging.Count > 0;
            var hasContentMeta = hasPaging && extracted.Metas != null && extracted.Metas.ContainsKey("content");

            if (hasContentMeta)
            {
                extracted = PagingExtractor.MergeContent(pageRequest.Uri, extracted, parsedBlock);
            }

            Assert.True(true);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Requests a proxy-list page and extracts it with a RuiJi expression that declares
        /// a [tile] section (with nested [meta]) and a [paging] section.
        /// The closing assertion is unconditional, so only a thrown exception fails the test.
        /// </summary>
        public void TestPaging()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("https://www.kuaidaili.com/free/inha/1/");

            var response = crawler.Request(request);
            var content  = response.Data.ToString();

            // The parsed expression is the only block this test needs; the former
            // standalone "new ExtractBlock()" local was never read and has been dropped.
            var block = RuiJiBlockParser.ParserBlock(@"
[tile]
	css table.table-bordered tr:gt(0):ohtml

	[meta]
	#ip
	css td[data-title='IP']:text

    # port
    css td[data-title='PORT']:text

[paging]
css #listnav a:[href]
");

            var result = RuiJiExtractor.Extract(content, block);

            Assert.True(true);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Runs a feed for testing: compiles the feed address into one or more concrete
        /// addresses, runs a <c>FeedJob</c> for each, extracts the snapshot content with the
        /// feed's RuiJi expression and optionally downloads every absolute URL listed in the
        /// extracted content.
        /// </summary>
        /// <param name="feed">feed definition; its <c>Address</c> is overwritten with each compiled address</param>
        /// <param name="down">when true, each absolute URL line in the extracted content is requested and stored under "www/download"</param>
        /// <param name="debug">when false, <c>CrawlTaskFunc.ClearContent</c> is applied to each extraction result</param>
        /// <returns>
        /// the list of extraction results (an empty <c>ExtractResult</c> for addresses processed
        /// without a RuiJi expression), or the caught exception object if anything throws
        /// </returns>
        public object TestFeed(FeedModel feed, [FromUri] bool down, [FromUri] bool debug = false)
        {
            try
            {
                var compile = new UrlCompile();
                var addrs   = compile.GetResult(feed.Address);
                var results = new List <ExtractResult>();

                foreach (var addr in addrs)
                {
                    feed.Address = addr.ToString();
                    var job  = new FeedJob();
                    var snap = job.DoTask(feed, false);

                    if (string.IsNullOrEmpty(feed.RuiJiExpression))
                    {
                        results.Add(new ExtractResult());
                        continue;
                    }

                    var block = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);

                    var result = RuiJiExtractor.Extract(snap.Content, block);

                    if (!debug)
                    {
                        CrawlTaskFunc.ClearContent(result);
                    }

                    // Content may be absent (nothing extracted, or cleared above); in that
                    // case there is simply nothing to download.
                    if (down && result.Content != null)
                    {
                        var s = new FileStorage(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download"));

                        var lines = result.Content.ToString().Replace("\r\n", "\n").Split('\n');
                        foreach (var line in lines)
                        {
                            // Trim before validating so that padded lines are accepted and the
                            // same string is validated, requested and stored.
                            var file = line.Trim();

                            if (file.Length > 0 && Uri.IsWellFormedUriString(file, UriKind.Absolute))
                            {
                                var res = Crawler.Request(file);
                                var c   = new DownloadContentModel();
                                c.Url   = file;
                                c.IsRaw = res.IsRaw;
                                c.Data  = res.Data;

                                s.Insert(c);
                            }
                        }
                    }

                    results.Add(result);
                }

                return(results);
            }
            catch (Exception ex)
            {
                // The exception is returned, not rethrown, so the caller receives it as the result.
                return(ex);
            }
        }
Exemplo n.º 4
0
 /// <summary>
 /// Looks up the rules that <c>RuleLiteDb.Match</c> returns for a URL and wraps each one,
 /// together with its feature, in an <c>ExtractFeatureBlock</c>.
 /// </summary>
 /// <param name="url">URL to match against the stored rules</param>
 /// <param name="useBlock">
 /// true to deserialize each block from the rule's <c>BlockExpression</c> JSON;
 /// false to parse it from the rule's <c>RuiJiExpression</c>
 /// </param>
 /// <returns>list of <c>ExtractFeatureBlock</c>, one per matched rule</returns>
 public object MatchUrlRule(string url, bool useBlock = false)
 {
     var matched = RuleLiteDb.Match(url);

     if (!useBlock)
     {
         return(matched
                .Select(rule => new ExtractFeatureBlock(RuiJiBlockParser.ParserBlock(rule.RuiJiExpression), rule.Feature))
                .ToList());
     }

     return(matched
            .Select(rule => new ExtractFeatureBlock(JsonConvert.DeserializeObject <ExtractBlock>(rule.BlockExpression), rule.Feature))
            .ToList());
 }
Exemplo n.º 5
0
        /// <summary>
        /// Feeds a meta expression containing one named group per selector keyword used here
        /// (css, ex, exp, reg, regR, regS, text, xpath, jpath) to
        /// <c>RuiJiBlockParser.ParserMeta</c> and checks that at least one meta is returned.
        /// </summary>
        public void TestMeta()
        {
            var expression = @"
                #css
                css h4 a[href] -r
                css h4:ohtml
                css h4:html -r
                css h4 a:text

                #exclude
                ex /:/ -b
                ex /\-e/ -e
                ex /\-/ -a
                ex /[\d]*/

                #expression
                exp ????/??/?? ??:??:??* 
                exp datetime_?? -r

                #regex
                reg /[\d]*/
                reg /aa([\d]*)sf/ 0 1
                reg /aa([\d]*)sf/ -r

                #regexReplace
                regR />>/ >
                
                #regexSplit
                regS /aaa/ 2 3 5
                regS /aaa/ 2 3 5 -r

                #textRange
                text /a\naa/ /b\tbb/
                text /aaa/ /bbb/ -r

                #xpath
                xpath ladkfeio
                xpath dlqwekrjl -r

                #jsonPath
                jpath dlsldf.kljs
                jpath dlkejl -r
                ";

            var parsedMetas = RuiJiBlockParser.ParserMeta(expression);
            var hasAnyMeta  = parsedMetas.Count > 0;

            Assert.True(hasAnyMeta);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Parses a block expression whose metas use the reg / regR / regS selectors with
        /// zero, one and two trailing arguments, serializes the parsed block to JSON,
        /// converts it back to an expression, and checks that the parsed block has metas.
        /// </summary>
        public void TestReg()
        {
            var source = @"
[block]

[blocks]
@block1
@block2

[tile]
css img

	[meta]
	#title
	css img:[title]
	regR /aabbcc/
	regR /aabbcc/ 123 

	#src
	css img:[src]
	reg /aabbcc/ 
	reg /aabbcc/ 1
	reg /aabbcc/ 1 2
	regS /aabbcc/
	regS /aabbcc/ 1
	regS /aabbcc/ 1 2

	[paging]
	css #listnav a:[href]

[paging]
css #listnav a:[href]

[block]
#block1
css .list1

[block]
#block2
css .list2
";

            var parsed = RuiJiBlockParser.ParserBlock(source);

            // Neither value is asserted on; both calls only have to complete without throwing.
            var json        = JsonConvert.SerializeObject(parsed);
            var regenerated = Converter.ToExpression(parsed);

            Assert.True(parsed.Metas.Count > 0);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Parses a block expression whose names carry suffixes (_dt, _i, _d, _f, _b, _l, _ee)
        /// and checks that the parsed block contains at least one meta.
        /// </summary>
        public void TestExpressionType()
        {
            var expression = @"
[block]
#name_dda_ee
css .entry-content:html

[blocks]
    @block1
    @block2

[tile]
    #aa_l
    css a:ohtml

    [meta]
        #time_dt
        css time:text

[meta]
    #time_dt
    css time:text

    #words_i
    css .author:text

    #score_d
    css .entry-title:text

    #score_1_f
    css .entry-content:html

    #hasLink_b
    css h4 a:[href] -r

[block]
#block1
css .list1

[block]
#block2
css .list2
";

            var parsed   = RuiJiBlockParser.ParserBlock(expression);
            var hasMetas = parsed.Metas.Count > 0;

            Assert.True(hasMetas);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Extracts the distinct link addresses from a feed snapshot.
        /// </summary>
        /// <remarks>
        /// The default tile selector is the CSS selector "a" with attribute "href". When
        /// <c>feed.UseBlock</c> is set and a block expression is present, the whole block is
        /// deserialized from that JSON instead; otherwise, when a RuiJi expression is
        /// present, its selectors replace the default tile selectors.
        /// </remarks>
        /// <param name="feed">snapshot supplying the content, the request URL and the optional expressions</param>
        /// <returns>
        /// distinct addresses with any "#fragment" removed; values that are well-formed
        /// relative URIs are resolved against <c>feed.Url</c>
        /// </returns>
        public List <string> ExtractAddress(FeedSnapshot feed)
        {
            var block = new ExtractBlock();

            block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

            if (feed.UseBlock)
            {
                if (!string.IsNullOrEmpty(feed.BlockExpression))
                {
                    block = JsonConvert.DeserializeObject <ExtractBlock>(feed.BlockExpression);
                }
            }
            else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
            {
                block.TileSelector.Selectors.Clear();

                var s = RuiJiBlockParser.ParserBase(feed.RuiJiExpression).Selectors;
                block.TileSelector.Selectors.AddRange(s);
            }

            var result  = RuiJiExtractor.Extract(feed.Content, block);
            var results = new List <string>();

            if (result.Tiles != null)
            {
                foreach (var item in result.Tiles)
                {
                    // A tile without content carries no address; skip it instead of failing.
                    if (item.Content == null)
                    {
                        continue;
                    }

                    var href = item.Content.ToString();

                    // Drop the fragment part; a single IndexOf both detects and locates it.
                    var fragmentStart = href.IndexOf('#');
                    if (fragmentStart >= 0)
                    {
                        href = href.Substring(0, fragmentStart);
                    }

                    if (Uri.IsWellFormedUriString(href, UriKind.Relative))
                    {
                        href = new Uri(new Uri(feed.Url), href).AbsoluteUri;
                    }

                    results.Add(href);
                }
            }

            return(results.Distinct().ToList());
        }
Exemplo n.º 9
0
        /// <summary>
        /// Requests a page and extracts it with a block whose tile selectors come from the
        /// expression "css a:[href]". The closing assertion is unconditional, so only a
        /// thrown exception fails the test.
        /// </summary>
        public void TestExtract()
        {
            var crawler  = new RuiJiCrawler();
            var response = crawler.Request(new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/"));
            var html     = response.Data.ToString();

            var block = new ExtractBlock();
            block.TileSelector.Selectors.AddRange(RuiJiBlockParser.ParserBase("css a:[href]").Selectors);

            var extracted = RuiJiExtractor.Extract(html, block);

            Assert.True(true);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Parses a block expression with [block], [blocks], [tile] and [meta] sections and
        /// checks that the parsed block contains at least one meta.
        /// </summary>
        public void TestBlock()
        {
            var expression = @"
[block]
#name
css .entry-content:html

[blocks]
    @block1
    @block2

[tile]
    #aa
    css a:ohtml

    [meta]
    #time
    css time:text

[meta]
    #time
    css time:text

    #author
    css .author:text

    #title
    css .entry-title:text

    #content
    css .entry-content:html

    #link
    css h4 a[href] -r
[block]
#block1
css .list1

[block]
#block2
css .list2
";

            var parsed   = RuiJiBlockParser.ParserBlock(expression);
            var hasMetas = parsed.Metas.Count > 0;

            Assert.True(hasMetas);
        }
Exemplo n.º 11
0
        /// <summary>
        /// Parses a block expression (including a "proc" line), round-trips the parsed block
        /// through JSON serialization, converts the restored block back to an expression and
        /// checks that the restored block still has metas.
        /// </summary>
        public void TestJC2()
        {
            var source = @"
[block]

[blocks]
@block1
@block2

[tile]
css img

	[meta]
	#title
	css img:[title]
	proc aabbcc

	#src
	css img:[src]

	[paging]
	css #listnav a:[href]

[paging]
css #listnav a:[href]

[block]
#block1
css .list1

[block]
#block2
css .list2
";

            var parsed   = RuiJiBlockParser.ParserBlock(source);
            var json     = JsonConvert.SerializeObject(parsed);
            var restored = JsonConvert.DeserializeObject <ExtractBlock>(json);

            // The regenerated expression is not asserted on; the call only has to succeed.
            var regenerated = Converter.ToExpression(restored);

            Assert.True(restored.Metas.Count > 0);
        }
Exemplo n.º 12
0
        /// <summary>
        /// Requests a JSONP endpoint, strips the callback wrapper with a regex selector,
        /// applies the JSON path "$..url" and checks that the extracted content is not empty.
        /// </summary>
        public void TestJsonPExtract()
        {
            var url = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1";

            // The compiler is instantiated but its Compile step is disabled, so the
            // "{# ... #}" placeholders in the URL are sent to the crawler as written.
            var compiler = new UrlCompile();

            var crawler  = new RuiJiCrawler();
            var response = crawler.Request(new Request(url));

            var block = RuiJiBlockParser.ParserBlock(@"
reg /jsonp[\d]+?\((.*)\)/ 1
jpath $..url
");
            var extracted = RuiJiExtractor.Extract(response.Data.ToString(), block);
            var text      = extracted.Content.ToString();

            Assert.IsTrue(text.Length > 0);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Tests a rule: requests the rule's URL, extracts the response with the rule's RuiJi
        /// expression and feature, picks the result with the most metas and, when that result
        /// has paging links and a "content" meta, passes it to
        /// <c>PagingExtractor.MergeContent</c>.
        /// </summary>
        /// <param name="rule">rule supplying the URL, HTTP method, RunJS flag, RuiJi expression and feature</param>
        /// <param name="debug">when false, <c>CrawlTaskFunc.ClearContent</c> is applied to the chosen result</param>
        /// <returns>
        /// the chosen extraction result (null when extraction yields no results), or an empty
        /// anonymous object when the response or its data is null
        /// </returns>
        public object TestRule([FromBody] RuleModel rule, bool debug = false)
        {
            var request = new Request(rule.Url);

            request.Method = rule.Method;
            request.RunJS  = (rule.RunJS == Status.ON);
            if (request.RunJS)
            {
                // NOTE(review): this assigns the property to itself and has no visible effect;
                // presumably the value was meant to come from the rule — confirm and fix.
                request.WaitDom = request.WaitDom;
            }

            var response = Crawler.Request(request);

            if (response != null && response.Data != null)
            {
                var content = response.Data.ToString();
                var block   = RuiJiBlockParser.ParserBlock(rule.RuiJiExpression);
                var r       = new ExtractRequest();
                r.Content = content;

                r.Blocks = new List <ExtractFeatureBlock> {
                    new ExtractFeatureBlock(block, rule.Feature)
                };

                var results = Extractor.Extract(r);

                // Metas is treated as nullable below, so rank a result without metas as 0
                // instead of dereferencing null while sorting.
                var result = results.OrderByDescending(m => m.Metas != null ? m.Metas.Count : 0).FirstOrDefault();

                if (result != null && result.Paging != null && result.Paging.Count > 0 && result.Metas != null && result.Metas.ContainsKey("content"))
                {
                    result = PagingExtractor.MergeContent(new Uri(rule.Url), result, block);
                }

                // FirstOrDefault yields null for an empty result set; there is nothing to clear then.
                if (!debug && result != null)
                {
                    CrawlTaskFunc.ClearContent(result);
                }

                return(result);
            }

            return(new { });
        }
Exemplo n.º 14
0
        /// <summary>
        /// Requests a proxy-list page, extracts tiles and paging links, and, when both are
        /// present, calls <c>PagingExtractor.CrawlPage</c> with a callback that stores each
        /// page's tiles as JSON under "www/download". The closing assertion is unconditional,
        /// so only a thrown exception fails the test.
        /// </summary>
        public void TestPaging()
        {
            var expression = @"
[tile]
	css table.table-bordered tr:gt(0):ohtml

	[meta]
	#ip
	css td[data-title='IP']:text

    #port
    css td[data-title='PORT']:text

[paging]
css #listnav a[href]";

            var crawler   = new RuiJiCrawler();
            var firstPage = new Request("https://www.kuaidaili.com/free/inha/10");
            var html      = crawler.Request(firstPage).Data.ToString();

            var block     = RuiJiBlockParser.ParserBlock(expression);
            var extracted = RuiJiExtractor.Extract(html, block);

            var hasPaging = extracted.Paging != null && extracted.Paging.Count > 0;

            if (hasPaging && extracted.Tiles != null)
            {
                var downloadDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download");
                var storage     = new FileStorage(downloadDir);

                // int.MaxValue is passed as the last argument of CrawlPage.
                PagingExtractor.CrawlPage(firstPage.Uri, extracted, block, (pageUri, pageResult) => {
                    var model = new DownloadContentModel
                    {
                        Url   = pageUri.AbsolutePath.Trim(),
                        IsRaw = false,
                        Data  = JsonConvert.SerializeObject(pageResult.Tiles)
                    };

                    storage.Insert(model);
                }, int.MaxValue);
            }

            Assert.True(true);
        }
Exemplo n.º 15
0
        /// <summary>
        /// Creates a feature block from an extract block and an optional feature expression.
        /// </summary>
        /// <remarks>
        /// The feature text is split into lines ("\r\n" is treated like "\n") and each
        /// non-blank line is parsed into one selector. When <paramref name="feature"/> is
        /// null or empty, <c>ExtractFeature</c> is not assigned.
        /// </remarks>
        /// <param name="block">extract block</param>
        /// <param name="feature">extract feature expression, one selector per line</param>
        public ExtractFeatureBlock(ExtractBlock block, string feature = "")
        {
            this.Block = block;

            if (string.IsNullOrEmpty(feature))
            {
                return;
            }

            var selectors = new List <ISelector>();

            foreach (var line in feature.Replace("\r\n", "\n").Split('\n'))
            {
                // Blank lines (for example a trailing newline) are not selector
                // expressions; do not hand them to the parser.
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                selectors.Add(RuiJiBlockParser.ParserSelector(line));
            }

            ExtractFeature = new ExtractFeature();

            ExtractFeature.Features = selectors;
        }
Exemplo n.º 16
0
        /// <summary>
        /// Parses a block expression with [block], [blocks], [tile] (nested [meta] and
        /// [paging]) and top-level [paging] sections, converts the parsed block back to an
        /// expression, and checks that the parsed block has metas.
        /// </summary>
        public void TestBlock()
        {
            var source = @"
[block]

[blocks]
@block1
@block2

[tile]
css img

	[meta]
	#title
	css img:[title]

	#src
	css img:[src]

	[paging]
	css #listnav a:[href]

[paging]
css #listnav a:[href]

[block]
#block1
css .list1

[block]
#block2
css .list2
";

            var parsed = RuiJiBlockParser.ParserBlock(source);

            // The regenerated expression is not asserted on; the call only has to succeed.
            var regenerated = Converter.ToExpression(parsed);

            Assert.True(parsed.Metas.Count > 0);
        }
Exemplo n.º 17
0
        /// <summary>
        /// Extracts the distinct link addresses from a snapshot.
        /// </summary>
        /// <remarks>
        /// The default tile selector is the CSS selector "a" with attribute "href"; when the
        /// snapshot carries an expression, its selectors replace that default.
        /// </remarks>
        /// <param name="snapshot">snapshot supplying the content, the request URL and the optional expression</param>
        /// <returns>
        /// distinct addresses with any "#fragment" removed; values that are well-formed
        /// relative URIs are resolved against <c>snapshot.RequestUrl</c>
        /// </returns>
        protected List <string> ExtractFeedAddress(Snapshot snapshot)
        {
            var block = new ExtractBlock();

            block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

            if (!string.IsNullOrEmpty(snapshot.Expression))
            {
                block.TileSelector.Selectors.Clear();

                var s = RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors;
                block.TileSelector.Selectors.AddRange(s);
            }

            var result  = RuiJiExtractor.Extract(snapshot.Content, block);
            var results = new List <string>();

            if (result.Tiles != null)
            {
                foreach (var item in result.Tiles)
                {
                    // A tile without content carries no address; skip it instead of failing.
                    if (item.Content == null)
                    {
                        continue;
                    }

                    var href = item.Content.ToString();

                    // Drop the fragment part; a single IndexOf both detects and locates it.
                    var fragmentStart = href.IndexOf('#');
                    if (fragmentStart >= 0)
                    {
                        href = href.Substring(0, fragmentStart);
                    }

                    if (Uri.IsWellFormedUriString(href, UriKind.Relative))
                    {
                        href = new Uri(new Uri(snapshot.RequestUrl), href).AbsoluteUri;
                    }

                    results.Add(href);
                }
            }

            return(results.Distinct().ToList());
        }
Exemplo n.º 18
0
        /// <summary>
        /// Executes a crawl task: loads the feed by id, compiles its address into concrete
        /// addresses and, for each one, downloads the feed, extracts it with the feed's RuiJi
        /// expression, extracts the linked addresses and collects a <c>ContentModel</c> for
        /// every address that yields a result. Progress messages are reported along the way.
        /// </summary>
        /// <param name="t">task argument; cast to <c>CrawlTaskModel</c></param>
        /// <param name="task">parallel task whose <c>Progress</c> is cast to <c>IProgress&lt;string&gt;</c> for status messages</param>
        /// <returns>
        /// list holding, per address, the feed extraction result followed by the content
        /// models; <c>ClearContent</c> is applied to every entry unless the model requests content
        /// </returns>
        public object Run(object t, ParallelTask task)
        {
            var taskModel = t as CrawlTaskModel;

            var collected = new List <object>();
            var progress  = task.Progress as IProgress <string>;

            progress.Report("正在读取Feed记录");
            var feed = FeedLiteDb.GetFeed(taskModel.FeedId);

            progress.Report("正在下载 Feed");

            var addresses = new UrlCompile().GetResult(feed.Address);

            foreach (var address in addresses)
            {
                feed.Address = address.ToString();

                var snapshot = new FeedJob().DoTask(feed, false);
                progress.Report("Feed 下载完成");

                var feedBlock = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);
                collected.Add(RuiJiExtractor.Extract(snapshot.Content, feedBlock));

                progress.Report("正在提取Feed地址");
                var urls = new FeedExtractJob().ExtractAddress(snapshot);
                progress.Report("Feed地址提取完成");

                // Linked pages are only fetched when the snapshot carries a RuiJi expression.
                if (!string.IsNullOrEmpty(snapshot.RuiJiExpression))
                {
                    foreach (var url in urls)
                    {
                        progress.Report("正在提取地址 " + url);
                        var extracted = Cooperater.GetResult(url);

                        if (extracted == null)
                        {
                            continue;
                        }

                        collected.Add(new ContentModel
                        {
                            Id    = taskModel.FeedId,
                            Url   = url,
                            Metas = extracted.Metas,
                            CDate = DateTime.Now
                        });
                    }
                }

                progress.Report("计算完成");

                // Runs once per address over everything collected so far.
                if (!taskModel.IncludeContent)
                {
                    foreach (var entry in collected)
                    {
                        ClearContent(entry);
                    }
                }
            }

            return(collected);
        }
Exemplo n.º 19
0
        /// <summary>
        /// Feeds a full expression ([block], [tile] and a nested [meta] section that
        /// exercises css, regS, text, regR, ex, exp, jpath, xpath and clear lines) to
        /// <c>RuiJiBlockParser.ParserMeta</c> and checks that at least one meta is returned.
        /// </summary>
        public void TestMeta()
        {
            var expression = @"
[block]
css #content_left

[tile]
css .result

	[meta]
	#title
	css h3.c-title:text

	#src
	css h3.c-title a:[href]

	#media
	css .c-author:text
	regS /\s+/ 0

	#date
	css .c-author:text
	regS /\s+/ 1

	#summary
	css .c-summary
	css .c-info -r
	css .c-author:text -r

	#text
	css .c-summary:text
	text /bmw/ /bmw/

	#regS
	css .c-summary
	regS /bmw/ 1

	#regR
	css .c-summary
	regR /bmw/ aabbcc

	#ex
	css .c-summary
	ex /bmw/ -b

	#exp
	css .c-summary
	exp http://*.baidu.com/* /\s+/

	#jpath
	css .c-summary
	jpath ..url

	#xpath
	css .c-summary
	xpath /aa/bb/c:[data]
    xpath /aa/bb/c:text
    xpath /aa/bb/c:xml
    xpath /aa/bb/c

	#clear
	css .c-summary
	clear span em
                ";

            var parsedMetas = RuiJiBlockParser.ParserMeta(expression);
            var hasAnyMeta  = parsedMetas.Count > 0;

            Assert.True(hasAnyMeta);
        }