/// <summary>
/// Runs the configured tile selectors over a feed snapshot's content and returns the
/// extracted addresses as a de-duplicated list.
/// </summary>
/// <param name="feed">
/// Snapshot supplying the content to scan (<c>Content</c>), the selector configuration
/// (<c>UseBlock</c>, <c>BlockExpression</c>, <c>RuiJiExpression</c>) and the URL
/// (<c>Url</c>) against which relative addresses are resolved.
/// </param>
/// <returns>
/// The distinct extracted addresses, in first-seen order, with everything from the first
/// <c>#</c> onwards removed. Values that are well-formed relative URIs are resolved
/// against <c>feed.Url</c>; all other values are returned as extracted.
/// </returns>
public List<string> ExtractAddress(FeedSnapshot feed)
{
    // Default tile selector, used when the snapshot carries no expression of its own.
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (feed.UseBlock)
    {
        if (!string.IsNullOrEmpty(feed.BlockExpression))
        {
            // Json.NET yields null for a literal "null" payload; keep the default block
            // in that case instead of handing a null block to the extractor.
            block = JsonConvert.DeserializeObject<ExtractBlock>(feed.BlockExpression) ?? block;
        }
    }
    else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
    {
        // A RuiJi expression replaces the default selector rather than adding to it.
        block.TileSelector.Selectors.Clear();
        var selectors = RuiJiBlockParser.ParserBase(feed.RuiJiExpression).Selectors;
        block.TileSelector.Selectors.AddRange(selectors);
    }

    var result = RuiJiExtractor.Extract(feed.Content, block);
    var results = new List<string>();

    if (result.Tiles != null)
    {
        // Parsed lazily so feed.Url is only validated when a relative address actually
        // needs resolving, and then only once for the whole loop.
        Uri baseUri = null;

        foreach (var item in result.Tiles)
        {
            var content = item.Content;
            if (content == null)
            {
                // A tile without content has no address to report.
                continue;
            }

            var href = content.ToString();

            // Strip the fragment so addresses differing only by "#..." collapse together.
            var fragmentIndex = href.IndexOf('#');
            if (fragmentIndex >= 0)
            {
                href = href.Substring(0, fragmentIndex);
            }

            if (Uri.IsWellFormedUriString(href, UriKind.Relative))
            {
                if (baseUri == null)
                {
                    baseUri = new Uri(feed.Url);
                }

                href = new Uri(baseUri, href).AbsoluteUri;
            }

            results.Add(href);
        }
    }

    return results.Distinct().ToList();
}
/// <summary>
/// Smoke test: requests a page through <c>RuiJiCrawler</c>, runs the
/// <c>"css a:[href]"</c> expression over the response body, and checks that the
/// extractor produced a result.
/// </summary>
/// <remarks>
/// Issues a real HTTP request, so it depends on the target site being reachable.
/// </remarks>
public void TestExtract()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");
    var response = crawler.Request(request);
    var content = response.Data.ToString();

    var block = new ExtractBlock();
    var selectors = RuiJiBlockParser.ParserBase("css a:[href]").Selectors;
    block.TileSelector.Selectors.AddRange(selectors);

    var result = RuiJiExtractor.Extract(content, block);

    // The previous Assert.True(true) could never fail; assert on the actual outcome.
    Assert.NotNull(result);
}
/// <summary>
/// Runs the configured tile selectors over a snapshot's content and returns the
/// extracted addresses as a de-duplicated list.
/// </summary>
/// <param name="snapshot">
/// Snapshot supplying the content to scan (<c>Content</c>), an optional RuiJi
/// expression (<c>Expression</c>) and the URL (<c>RequestUrl</c>) against which
/// relative addresses are resolved.
/// </param>
/// <returns>
/// The distinct extracted addresses, in first-seen order, with everything from the first
/// <c>#</c> onwards removed. Values that are well-formed relative URIs are resolved
/// against <c>snapshot.RequestUrl</c>; all other values are returned as extracted.
/// </returns>
protected List<string> ExtractFeedAddress(Snapshot snapshot)
{
    // Default tile selector, used when the snapshot carries no expression of its own.
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (!string.IsNullOrEmpty(snapshot.Expression))
    {
        // A RuiJi expression replaces the default selector rather than adding to it.
        block.TileSelector.Selectors.Clear();
        var selectors = RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors;
        block.TileSelector.Selectors.AddRange(selectors);
    }

    var result = RuiJiExtractor.Extract(snapshot.Content, block);
    var results = new List<string>();

    if (result.Tiles != null)
    {
        // Parsed lazily so snapshot.RequestUrl is only validated when a relative address
        // actually needs resolving, and then only once for the whole loop.
        Uri baseUri = null;

        foreach (var item in result.Tiles)
        {
            var content = item.Content;
            if (content == null)
            {
                // A tile without content has no address to report.
                continue;
            }

            var href = content.ToString();

            // Strip the fragment so addresses differing only by "#..." collapse together.
            var fragmentIndex = href.IndexOf('#');
            if (fragmentIndex >= 0)
            {
                href = href.Substring(0, fragmentIndex);
            }

            if (Uri.IsWellFormedUriString(href, UriKind.Relative))
            {
                if (baseUri == null)
                {
                    baseUri = new Uri(snapshot.RequestUrl);
                }

                href = new Uri(baseUri, href).AbsoluteUri;
            }

            results.Add(href);
        }
    }

    return results.Distinct().ToList();
}