Beispiel #1
0
        /// <summary>
        /// Extracts the distinct link addresses found in a feed snapshot's content.
        /// </summary>
        /// <remarks>
        /// The extraction block starts with a single <c>CssSelector("a", "href")</c>.
        /// When <c>feed.UseBlock</c> is set and <c>feed.BlockExpression</c> is non-empty, the
        /// block is deserialized from that JSON instead. When <c>feed.UseBlock</c> is not set
        /// and <c>feed.RuiJiExpression</c> is non-empty, the default selectors are replaced by
        /// the ones parsed from that expression.
        /// </remarks>
        /// <param name="feed">Snapshot providing the content, the source URL and the optional expressions.</param>
        /// <returns>
        /// The de-duplicated addresses, with any <c>#fragment</c> removed and relative
        /// addresses resolved against <c>feed.Url</c>. Empty when the extraction yields no tiles.
        /// </returns>
        public List<string> ExtractAddress(FeedSnapshot feed)
        {
            var block = new ExtractBlock();
            block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

            if (feed.UseBlock)
            {
                if (!string.IsNullOrEmpty(feed.BlockExpression))
                {
                    // DeserializeObject returns null for a JSON "null" payload; keep the
                    // default block in that case instead of handing null to the extractor.
                    block = JsonConvert.DeserializeObject<ExtractBlock>(feed.BlockExpression) ?? block;
                }
            }
            else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
            {
                var selectors = RuiJiBlockParser.ParserBase(feed.RuiJiExpression).Selectors;

                block.TileSelector.Selectors.Clear();
                block.TileSelector.Selectors.AddRange(selectors);
            }

            var result = RuiJiExtractor.Extract(feed.Content, block);
            var results = new List<string>();

            if (result.Tiles == null)
            {
                return results;
            }

            // Parsed lazily so an unusable feed.Url only matters when a relative link is met,
            // and parsed once instead of once per tile.
            Uri baseUri = null;

            foreach (var item in result.Tiles)
            {
                var href = item.Content.ToString();

                // Drop the fragment so links differing only by "#..." collapse to one address.
                var fragmentStart = href.IndexOf('#');
                if (fragmentStart >= 0)
                {
                    href = href.Substring(0, fragmentStart);
                }

                if (Uri.IsWellFormedUriString(href, UriKind.Relative))
                {
                    if (baseUri == null)
                    {
                        baseUri = new Uri(feed.Url);
                    }

                    href = new Uri(baseUri, href).AbsoluteUri;
                }

                results.Add(href);
            }

            return results.Distinct().ToList();
        }
Beispiel #2
0
        /// <summary>
        /// Smoke test: requests a page through <c>RuiJiCrawler</c> and runs tile extraction on the
        /// response using the selectors parsed from <c>"css a:[href]"</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this presumably performs a real HTTP request to www.ruijihg.com, so it
        /// needs network access - confirm before running it in an isolated CI environment.
        /// </remarks>
        public void TestExtract()
        {
            var crawler = new RuiJiCrawler();
            var request = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");

            var response = crawler.Request(request);
            var content = response.Data.ToString();

            var block = new ExtractBlock();
            var selectors = RuiJiBlockParser.ParserBase("css a:[href]").Selectors;
            block.TileSelector.Selectors.AddRange(selectors);

            var result = RuiJiExtractor.Extract(content, block);

            // Assert on the extraction outcome; the previous Assert.True(true) could never fail.
            Assert.NotNull(result);
        }
Beispiel #3
0
        /// <summary>
        /// Extracts the distinct link addresses found in a snapshot's content.
        /// </summary>
        /// <remarks>
        /// The extraction block starts with a single <c>CssSelector("a", "href")</c>. When
        /// <c>snapshot.Expression</c> is non-empty, those default selectors are replaced by the
        /// ones parsed from the expression.
        /// </remarks>
        /// <param name="snapshot">Snapshot providing the content, the request URL and the optional expression.</param>
        /// <returns>
        /// The de-duplicated addresses, with any <c>#fragment</c> removed and relative
        /// addresses resolved against <c>snapshot.RequestUrl</c>. Empty when the extraction
        /// yields no tiles.
        /// </returns>
        protected List<string> ExtractFeedAddress(Snapshot snapshot)
        {
            var block = new ExtractBlock();
            block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

            if (!string.IsNullOrEmpty(snapshot.Expression))
            {
                var selectors = RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors;

                block.TileSelector.Selectors.Clear();
                block.TileSelector.Selectors.AddRange(selectors);
            }

            var result = RuiJiExtractor.Extract(snapshot.Content, block);
            var results = new List<string>();

            if (result.Tiles == null)
            {
                return results;
            }

            // Parsed lazily so an unusable RequestUrl only matters when a relative link is met,
            // and parsed once instead of once per tile.
            Uri baseUri = null;

            foreach (var item in result.Tiles)
            {
                var href = item.Content.ToString();

                // Drop the fragment so links differing only by "#..." collapse to one address.
                var fragmentStart = href.IndexOf('#');
                if (fragmentStart >= 0)
                {
                    href = href.Substring(0, fragmentStart);
                }

                if (Uri.IsWellFormedUriString(href, UriKind.Relative))
                {
                    if (baseUri == null)
                    {
                        baseUri = new Uri(snapshot.RequestUrl);
                    }

                    href = new Uri(baseUri, href).AbsoluteUri;
                }

                results.Add(href);
            }

            return results.Distinct().ToList();
        }