Exemplo n.º 1
0
        /// <summary>
        /// Smoke test: fetches a blog page and runs a [meta] extraction expression
        /// (title, author, date, words_i, content) against the returned body.
        /// The final assertion is unconditional, so the test only fails if a step throws.
        /// </summary>
        public void TestExtractMeta()
        {
            // Fetch the page body.
            var page = new RuiJiCrawler().Request(new Request("https://my.oschina.net/zhupingqi/blog/1826317"));
            var html = page.Data.ToString();

            // Parse the extraction expression; the literal is whitespace-sensitive.
            var metaParser = new RuiJiParser();
            var parsed     = metaParser.ParseExtract(@"
[meta]
	#title
	css h1.header:text

	#author
	css div.blog-meta .avatar + span:text

	#date
	css div.blog-meta > div.item:first:text
	regS /发布于/ 1

	#words_i
	css div.blog-meta > div.item:eq(1):text
	regS / / 1

	#content
	css #articleContent:html");

            // Run the extraction; the outcome itself is not inspected.
            var extracted = RuiJiExtractor.Extract(html, parsed.Result);

            Assert.True(true);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Smoke test: parses "expression_address.txt" from the application base directory.
        /// The return value of ParseFile is ignored; the test only fails if parsing throws.
        /// </summary>
        public void TestAdvExpression1()
        {
            var fileParser     = new RuiJiParser();
            var expressionFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "expression_address.txt");

            fileParser.ParseFile(expressionFile);

            Assert.True(true);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Smoke test: fetches the blog index page and runs a two-line expression
        /// (a css selector line followed by an exp line) against the returned body.
        /// The final assertion is unconditional, so the test only fails if a step throws.
        /// </summary>
        public void TestExtract2()
        {
            // Fetch the page body.
            var page = new RuiJiCrawler().Request(new Request("https://www.oschina.net/blog"));
            var html = page.Data.ToString();

            // Parse the expression, then run it; the outcome itself is not inspected.
            var linkParser = new RuiJiParser();
            var parsed     = linkParser.ParseExtract("css a.blog-title-link:[href]\nexp https://my.oschina.net/*/blog/*");
            var extracted  = RuiJiExtractor.Extract(html, parsed.Result);

            Assert.True(true);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Extracts the distinct link addresses from the content of a feed snapshot.
        /// </summary>
        /// <param name="feed">
        /// Snapshot supplying the content to scan, the URL used to resolve relative links,
        /// and an optional serialized block (when <c>UseBlock</c> is set) or RuiJi expression
        /// that replaces the default "every anchor href" selector.
        /// </param>
        /// <returns>
        /// The extracted addresses in first-seen order, without duplicates. Fragments ("#...")
        /// are stripped and relative addresses are resolved against <c>feed.Url</c>.
        /// </returns>
        public List <string> ExtractAddress(FeedSnapshot feed)
        {
            // Default selector: the href of every anchor.
            var block = new ExtractBlock();
            block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

            if (feed.UseBlock)
            {
                if (!string.IsNullOrEmpty(feed.BlockExpression))
                {
                    // DeserializeObject yields null for input such as the JSON literal "null";
                    // keep the default block in that case instead of extracting with null.
                    block = JsonConvert.DeserializeObject <ExtractBlock>(feed.BlockExpression) ?? block;
                }
            }
            else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
            {
                // A custom expression replaces the default selector entirely.
                block.TileSelector.Selectors.Clear();
                block.TileSelector.Selectors.AddRange(RuiJiBlockParser.ParserBase(feed.RuiJiExpression).Selectors);
            }

            var result  = RuiJiExtractor.Extract(feed.Content, block);
            var results = new List <string>();

            if (result.Tiles != null)
            {
                // Built on first use so an unusable feed.Url only matters when a relative link exists.
                Uri baseUri = null;

                foreach (var item in result.Tiles)
                {
                    // A tile without content carries no address.
                    if (item.Content == null)
                    {
                        continue;
                    }

                    var href = item.Content.ToString();

                    // Drop the fragment so links differing only by "#..." collapse to one address.
                    var fragmentStart = href.IndexOf('#');
                    if (fragmentStart >= 0)
                    {
                        href = href.Substring(0, fragmentStart);
                    }

                    if (Uri.IsWellFormedUriString(href, UriKind.Relative))
                    {
                        if (baseUri == null)
                        {
                            baseUri = new Uri(feed.Url);
                        }

                        href = new Uri(baseUri, href).AbsoluteUri;
                    }

                    results.Add(href);
                }
            }

            return results.Distinct().ToList();
        }
Exemplo n.º 5
0
        /// <summary>
        /// Extracts the distinct link addresses from the content of a snapshot.
        /// </summary>
        /// <param name="snapshot">
        /// Snapshot supplying the content to scan, the request URL used to resolve relative
        /// links, and an optional expression that replaces the default "every anchor href" selector.
        /// </param>
        /// <returns>
        /// The extracted addresses in first-seen order, without duplicates. Fragments ("#...")
        /// are stripped and relative addresses are resolved against <c>snapshot.RequestUrl</c>.
        /// </returns>
        protected List <string> ExtractFeedAddress(Snapshot snapshot)
        {
            // Default selector: the href of every anchor.
            var block = new ExtractBlock();
            block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

            if (!string.IsNullOrEmpty(snapshot.Expression))
            {
                // A custom expression replaces the default selector entirely.
                block.TileSelector.Selectors.Clear();
                block.TileSelector.Selectors.AddRange(RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors);
            }

            var result  = RuiJiExtractor.Extract(snapshot.Content, block);
            var results = new List <string>();

            if (result.Tiles != null)
            {
                // Built on first use so an unusable RequestUrl only matters when a relative link exists.
                Uri baseUri = null;

                foreach (var item in result.Tiles)
                {
                    // A tile without content carries no address.
                    if (item.Content == null)
                    {
                        continue;
                    }

                    var href = item.Content.ToString();

                    // Drop the fragment so links differing only by "#..." collapse to one address.
                    var fragmentStart = href.IndexOf('#');
                    if (fragmentStart >= 0)
                    {
                        href = href.Substring(0, fragmentStart);
                    }

                    if (Uri.IsWellFormedUriString(href, UriKind.Relative))
                    {
                        if (baseUri == null)
                        {
                            baseUri = new Uri(snapshot.RequestUrl);
                        }

                        href = new Uri(baseUri, href).AbsoluteUri;
                    }

                    results.Add(href);
                }
            }

            return results.Distinct().ToList();
        }
Exemplo n.º 6
0
        /// <summary>
        /// Smoke test: fetches a category page and runs a [tile] expression with a nested
        /// [meta] section (title, summary) against the returned body.
        /// The final assertion is unconditional, so the test only fails if a step throws.
        /// </summary>
        public void TestExtractTile()
        {
            // Fetch the page body.
            var page = new RuiJiCrawler().Request(new Request("http://www.ruijihg.com/archives/category/tech/bigdata"));
            var html = page.Data.ToString();

            // Parse the extraction expression; the literal is whitespace-sensitive.
            var tileParser = new RuiJiParser();
            var parsed     = tileParser.ParseExtract(@"[tile]
css article:html

    [meta]
	#title
	css .entry-header:text

	#summary
	css .entry-header + p:text
	ex /Read more »/ -e");

            // Run the extraction; the outcome itself is not inspected.
            var extracted = RuiJiExtractor.Extract(html, parsed.Result);

            Assert.True(true);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Builds the feed requests described by the ".feed" files found in <c>jobPath</c>.
        /// </summary>
        /// <remarks>
        /// Each file is parsed with a fresh <c>RuiJiParser</c>. The request URI from the file is
        /// expanded through <c>UrlCompile.GetResult</c>, and one <c>FeedRequest</c> is produced per
        /// resulting address, with the setting id suffixed by "_" and the address index.
        /// </remarks>
        /// <returns>
        /// The collected requests, or an empty list when any exception is thrown while
        /// reading or parsing (the exception message is logged).
        /// </returns>
        protected override List <FeedRequest> GetRequests()
        {
            Logger.GetLogger("").Info("start get feed");

            try
            {
                var requests = new List <FeedRequest>();
                var compile  = new UrlCompile();
                var files    = Directory.GetFiles(jobPath);

                foreach (var file in files)
                {
                    // Ordinal, case-insensitive match avoids culture-dependent lowercasing.
                    if (!string.Equals(Path.GetExtension(file), ".feed", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    var parser = new RuiJiParser();
                    var result = parser.ParseFile(file);

                    if (result)
                    {
                        var request = parser.GetResult <Request>().Result;
                        var setting = parser.GetResult <FeedSetting>().Result;

                        if (request == null || setting == null)
                        {
                            continue;
                        }

                        var addrs = compile.GetResult(request.Uri.ToString());

                        // Remember the id as parsed so each address gets "<id>_<index>" rather than
                        // an ever-growing "<id>_0_1_2..." chain.
                        var baseId = setting.Id;

                        for (int i = 0; i < addrs.Length; i++)
                        {
                            var addr = addrs[i].ToString();

                            var r = (Request)request.Clone();
                            r.Uri = new Uri(addr);

                            setting.Id = baseId + "_" + i;
                            var tag = JsonConvert.SerializeObject(setting);
                            r.Tag = tag;

                            var fr = new FeedRequest();
                            fr.Request = r;

                            // Give every FeedRequest its own snapshot of the setting; sharing one
                            // instance would leave all of them with the id of the last address.
                            // NOTE(review): assumes FeedSetting round-trips through JSON (it is
                            // already serialized into Tag) — confirm no members are excluded.
                            fr.Setting    = JsonConvert.DeserializeObject <FeedSetting>(tag) ?? setting;
                            fr.Expression = parser.GetResult <ExtractBlock>().Expression;

                            requests.Add(fr);
                        }
                    }
                }

                return requests;
            }
            catch (Exception ex)
            {
                Logger.GetLogger("").Info("get feed error " + ex.Message);

                return new List <FeedRequest>();
            }
        }