/// <summary>
/// Downloads an oschina blog post and runs a <c>[meta]</c> extraction expression
/// (title, author, date, word count and content fields) against the returned page.
/// </summary>
/// <remarks>
/// Smoke test: the final assertion is unconditional, so the test can only fail when
/// the request, the expression parsing or the extraction throws.
/// </remarks>
public void TestExtractMeta()
{
    const string url = "https://my.oschina.net/zhupingqi/blog/1826317";
    const string expression = @" [meta] #title css h1.header:text #author css div.blog-meta .avatar + span:text #date css div.blog-meta > div.item:first:text regS /发布于/ 1 #words_i css div.blog-meta > div.item:eq(1):text regS / / 1 #content css #articleContent:html";

    // Fetch the page and take its body as text.
    var page = new RuiJiCrawler().Request(new Request(url));
    var html = page.Data.ToString();

    // Turn the expression into an extract block and apply it to the page.
    var parsed = new RuiJiParser().ParseExtract(expression);
    RuiJiExtractor.Extract(html, parsed.Result);

    Assert.True(true);
}
/// <summary>
/// Parses the <c>expression_address.txt</c> file located in the application base directory.
/// </summary>
/// <remarks>
/// Smoke test: the assertion is unconditional, so only an exception thrown while
/// parsing makes the test fail; the return value of <c>ParseFile</c> is not inspected.
/// </remarks>
public void TestAdvExpression1()
{
    var ruiJiParser = new RuiJiParser();

    var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    var expressionFile = Path.Combine(baseDirectory, "expression_address.txt");
    ruiJiParser.ParseFile(expressionFile);

    Assert.True(true);
}
/// <summary>
/// Downloads the oschina blog index and applies a two-line expression
/// (a css selector line followed by an <c>exp</c> wildcard line) to it.
/// </summary>
/// <remarks>
/// Smoke test: the assertion is unconditional, so the extraction result is never checked;
/// the test only fails when one of the calls throws.
/// </remarks>
public void TestExtract2()
{
    // Fetch the page and take its body as text.
    var page = new RuiJiCrawler().Request(new Request("https://www.oschina.net/blog"));
    var html = page.Data.ToString();

    // Parse the expression and run the extraction.
    var parsed = new RuiJiParser().ParseExtract("css a.blog-title-link:[href]\nexp https://my.oschina.net/*/blog/*");
    RuiJiExtractor.Extract(html, parsed.Result);

    Assert.True(true);
}
/// <summary>
/// Extracts the distinct addresses found in the content of a feed snapshot.
/// </summary>
/// <remarks>
/// The extraction starts from a default block whose tile selector is
/// <c>new CssSelector("a", "href")</c>. When <c>feed.UseBlock</c> is set and
/// <c>feed.BlockExpression</c> is non-empty, the block is deserialized from that JSON instead.
/// When <c>feed.UseBlock</c> is not set and <c>feed.RuiJiExpression</c> is non-empty, the
/// default tile selectors are replaced by the ones parsed from the expression.
/// Each extracted value has its <c>#fragment</c> removed and, if it is a relative
/// reference, is resolved against <c>feed.Url</c>.
/// </remarks>
/// <param name="feed">Snapshot providing the content, its url and the extraction settings.</param>
/// <returns>The extracted addresses without duplicates, in order of first appearance.</returns>
public List<string> ExtractAddress(FeedSnapshot feed)
{
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (feed.UseBlock)
    {
        if (!string.IsNullOrEmpty(feed.BlockExpression))
        {
            // DeserializeObject returns null for a JSON "null" document; keep the
            // default block in that case rather than handing null to the extractor.
            block = JsonConvert.DeserializeObject<ExtractBlock>(feed.BlockExpression) ?? block;
        }
    }
    else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
    {
        var selectors = RuiJiBlockParser.ParserBase(feed.RuiJiExpression).Selectors;
        block.TileSelector.Selectors.Clear();
        block.TileSelector.Selectors.AddRange(selectors);
    }

    var result = RuiJiExtractor.Extract(feed.Content, block);
    var addresses = new List<string>();

    if (result.Tiles == null)
    {
        return addresses;
    }

    foreach (var tile in result.Tiles)
    {
        // A tile without content carries no address; skip it instead of throwing.
        if (tile.Content == null)
        {
            continue;
        }

        var href = tile.Content.ToString();

        // Drop the fragment so that links differing only by "#..." collapse into one.
        var fragmentStart = href.IndexOf('#');
        if (fragmentStart >= 0)
        {
            href = href.Substring(0, fragmentStart);
        }

        // Relative references are resolved against the url the feed was fetched from.
        if (Uri.IsWellFormedUriString(href, UriKind.Relative))
        {
            href = new Uri(new Uri(feed.Url), href).AbsoluteUri;
        }

        addresses.Add(href);
    }

    return addresses.Distinct().ToList();
}
/// <summary>
/// Extracts the distinct addresses found in the content of a snapshot.
/// </summary>
/// <remarks>
/// The extraction uses a block whose tile selector defaults to
/// <c>new CssSelector("a", "href")</c>. When <c>snapshot.Expression</c> is non-empty, the
/// default selectors are replaced by the ones parsed from that expression.
/// Each extracted value has its <c>#fragment</c> removed and, if it is a relative
/// reference, is resolved against <c>snapshot.RequestUrl</c>.
/// </remarks>
/// <param name="snapshot">Snapshot providing the content, the request url and the optional expression.</param>
/// <returns>The extracted addresses without duplicates, in order of first appearance.</returns>
protected List<string> ExtractFeedAddress(Snapshot snapshot)
{
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (!string.IsNullOrEmpty(snapshot.Expression))
    {
        var selectors = RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors;
        block.TileSelector.Selectors.Clear();
        block.TileSelector.Selectors.AddRange(selectors);
    }

    var result = RuiJiExtractor.Extract(snapshot.Content, block);
    var addresses = new List<string>();

    if (result.Tiles == null)
    {
        return addresses;
    }

    foreach (var tile in result.Tiles)
    {
        // A tile without content carries no address; skip it instead of throwing.
        if (tile.Content == null)
        {
            continue;
        }

        var href = tile.Content.ToString();

        // Drop the fragment so that links differing only by "#..." collapse into one.
        var fragmentStart = href.IndexOf('#');
        if (fragmentStart >= 0)
        {
            href = href.Substring(0, fragmentStart);
        }

        // Relative references are resolved against the url the snapshot was requested from.
        if (Uri.IsWellFormedUriString(href, UriKind.Relative))
        {
            href = new Uri(new Uri(snapshot.RequestUrl), href).AbsoluteUri;
        }

        addresses.Add(href);
    }

    return addresses.Distinct().ToList();
}
/// <summary>
/// Downloads a category listing page and applies an expression containing a
/// <c>[tile]</c> section followed by a <c>[meta]</c> section (title and summary fields).
/// </summary>
/// <remarks>
/// Smoke test: the assertion is unconditional, so the extracted tiles are never checked;
/// the test only fails when one of the calls throws.
/// </remarks>
public void TestExtractTile()
{
    const string url = "http://www.ruijihg.com/archives/category/tech/bigdata";
    const string expression = @"[tile] css article:html [meta] #title css .entry-header:text #summary css .entry-header + p:text ex /Read more »/ -e";

    // Fetch the page and take its body as text.
    var page = new RuiJiCrawler().Request(new Request(url));
    var html = page.Data.ToString();

    // Turn the expression into an extract block and apply it to the page.
    var parsed = new RuiJiParser().ParseExtract(expression);
    RuiJiExtractor.Extract(html, parsed.Result);

    Assert.True(true);
}
/// <summary>
/// Builds the feed requests described by the <c>.feed</c> files found in <c>jobPath</c>.
/// </summary>
/// <remarks>
/// Every <c>.feed</c> file is parsed with a fresh <c>RuiJiParser</c>. Files that fail to
/// parse, or that yield no request or no setting, are skipped. The request uri is expanded
/// through <c>UrlCompile.GetResult</c>, and one <c>FeedRequest</c> is produced per resulting
/// address. Each of them receives its own copy of the feed setting whose id is the original
/// id followed by <c>_&lt;index&gt;</c>.
/// </remarks>
/// <returns>
/// The feed requests that were built, or an empty list when any exception occurs
/// (the exception message is logged).
/// </returns>
protected override List<FeedRequest> GetRequests()
{
    Logger.GetLogger("").Info("start get feed");

    try
    {
        var requests = new List<FeedRequest>();
        var compile = new UrlCompile();

        foreach (var file in Directory.GetFiles(jobPath))
        {
            // Ordinal comparison: a file extension is an identifier, not linguistic text.
            if (!string.Equals(Path.GetExtension(file), ".feed", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            var parser = new RuiJiParser();
            var parsed = parser.ParseFile(file);
            if (parsed)
            {
                var request = parser.GetResult<Request>().Result;
                var setting = parser.GetResult<FeedSetting>().Result;

                if (request == null || setting == null)
                {
                    continue;
                }

                // Remember the id as written in the file so every address derives its id
                // from it, rather than from the id already suffixed for a previous address.
                var baseId = setting.Id;
                var addrs = compile.GetResult(request.Uri.ToString());

                for (int i = 0; i < addrs.Length; i++)
                {
                    // Direct cast: an unexpected clone type fails here with a clear
                    // InvalidCastException instead of a later NullReferenceException.
                    var r = (Request)request.Clone();
                    r.Uri = new Uri(addrs[i].ToString());

                    // Give each address an independent setting instance. Sharing one
                    // instance made all FeedRequests end up with the id of the last address.
                    // The copy is made through the same JSON form that is stored in Tag.
                    var addressSetting = JsonConvert.DeserializeObject<FeedSetting>(JsonConvert.SerializeObject(setting));
                    addressSetting.Id = baseId + "_" + i;
                    r.Tag = JsonConvert.SerializeObject(addressSetting);

                    var fr = new FeedRequest();
                    fr.Request = r;
                    fr.Setting = addressSetting;
                    fr.Expression = parser.GetResult<ExtractBlock>().Expression;

                    requests.Add(fr);
                }
            }
        }

        return requests;
    }
    catch (Exception ex)
    {
        // Best effort: any failure discards the partial result and yields no requests.
        Logger.GetLogger("").Info("get feed error " + ex.Message);
        return new List<FeedRequest>();
    }
}