/// <summary>
/// Downloads a proxy listing page and runs an extraction driven by a block
/// expression that declares [tile], [meta] and [paging] sections.
/// </summary>
public void TestPaging()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("https://www.kuaidaili.com/free/inha/1/");
    var response = crawler.Request(request);
    var content = response.Data.ToString();

    // Parse the textual expression into the block handed to the extractor.
    var s = RuiJiExpression.PaserBlock(@" [tile] css table.table-bordered tr:gt(0):ohtml [meta] #ip css td[data-title='IP']:text # port css td[data-title='PORT']:text [paging] css #listnav a[href] ");

    var result = RuiJiExtracter.Extract(content, s);

    // The previous Assert.IsTrue(true) could never fail; at least require
    // that the extractor produced a result object.
    Assert.IsNotNull(result);
}
/// <summary>
/// Parses a block expression that contains a single [meta] section and checks
/// that at least one meta entry was produced.
/// </summary>
public void TestBlock()
{
    // A larger sample expression used to be assigned here and was overwritten
    // before it was ever read; only the expression that is actually parsed is kept.
    var block = @" [meta] #time css time:text #author css .author:text #title css .entry-title:text #content css .entry-content:html #link css h4 a[href] -r";

    var m = RuiJiExpression.PaserBlock(block);

    Assert.IsTrue(m.Metas.Count > 0);
}
/// <summary>
/// Executes a crawl task: loads the feed named by the task model, compiles its
/// address, downloads it, extracts the feed page itself, then extracts every
/// address found in the feed snapshot.
/// </summary>
/// <param name="t">The task payload; must be a <c>CrawlTaskModel</c>.</param>
/// <param name="task">The running task; its <c>Progress</c> receives status messages when it is an <c>IProgress&lt;string&gt;</c>.</param>
/// <returns>A <c>List&lt;ExtractResult&gt;</c> with the feed result first, followed by the results for each extracted address.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="t"/> is not a <c>CrawlTaskModel</c>.</exception>
public object Run(object t, ParallelTask task)
{
    var model = t as CrawlTaskModel;
    if (model == null)
    {
        // Previously this surfaced later as a NullReferenceException on model.FeedId.
        throw new ArgumentException("Task payload must be a CrawlTaskModel.", "t");
    }

    // Progress reporting is auxiliary: a missing or differently-typed reporter
    // must not abort the crawl.
    var reporter = task.Progress as IProgress<string>;
    Action<string> report = message =>
    {
        if (reporter != null)
        {
            reporter.Report(message);
        }
    };

    var results = new List<ExtractResult>();

    report("正在读取Feed记录");
    var feed = FeedLiteDb.GetFeed(model.FeedId);

    report("正在下载 Feed");
    var compile = new CompileFeedAddress();
    feed.Address = compile.Compile(feed.Address);

    var job = new FeedJob();
    var snap = job.DoTask(feed, false);
    report("Feed 下载完成");

    // Extract the feed page itself using the feed's own expression.
    var block = RuiJiExpression.ParserBlock(feed.RuiJiExpression);
    var feedResult = RuiJiExtracter.Extract(snap.Content, block);
    results.Add(feedResult);

    report("正在提取Feed地址");
    var j = new FeedExtractJob();
    var urls = j.ExtractAddress(snap);
    report("Feed地址提取完成");

    foreach (var url in urls)
    {
        report("正在提取地址 " + url);
        var r = ContentQueue.Instance.Extract(url);
        results.AddRange(r);
    }

    report("计算完成");

    if (!model.IncludeContent)
    {
        // The caller did not ask for content, so pass every result through ClearContent.
        results.ForEach(m => ClearContent(m));
    }

    return results;
}
/// <summary>
/// Feeds a meta expression that exercises the css, ex, exp, reg, regR, regS,
/// text, xpath and jpath selector keywords to the meta parser and checks that
/// at least one entry comes back.
/// </summary>
public void TestMeta()
{
    var expression = @" #css css h4 a[href] -r css h4:ohtml css h4:html -r css h4 a:text #exclude ex /:/ -b ex /\-e/ -e ex /\-/ -a ex /[\d]*/ #expression exp ????/??/?? ??:??:??* exp datetime_?? -r #regex reg /[\d]*/ reg /aa([\d]*)sf/ 0 1 reg /aa([\d]*)sf/ -r #regexReplace regR /aaaa/ dddd/ #regexSplit regS /aaa/ 2 3 5 regS /aaa/ 2 3 5 -r #textRange text /a\naa/ /b\tbb/ text /aaa/ /bbb/ -r #xpath xpath ladkfeio xpath dlqwekrjl -r #jsonPath jpath dlsldf.kljs jpath dlkejl -r ";

    var parsed = RuiJiExpression.PaserMeta(expression);

    Assert.IsTrue(parsed.Count > 0);
}
/// <summary>
/// Downloads a page and runs an extraction whose tile selector is built from
/// the expression <c>css a[href]</c>.
/// </summary>
public void TestExtract()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");
    var response = crawler.Request(request);
    var content = response.Data.ToString();

    // Build the block by hand: only the tile selector list is populated.
    var block = new ExtractBlock();
    var s = RuiJiExpression.ParserBase("css a[href]").Selectors;
    block.TileSelector.Selectors.AddRange(s);

    var result = RuiJiExtracter.Extract(content, block);

    // The previous Assert.IsTrue(true) could never fail; at least require
    // that the extractor produced a result object.
    Assert.IsNotNull(result);
}
/// <summary>
/// Extracts the distinct list of addresses referenced by a feed snapshot.
/// </summary>
/// <remarks>
/// The default block selects the <c>href</c> of every <c>a</c> element. When
/// the snapshot uses a block, a non-empty <c>BlockExpression</c> replaces the
/// whole block (deserialized from JSON); otherwise a non-empty
/// <c>RuiJiExpression</c> replaces the tile selectors. Fragments (<c>#...</c>)
/// are stripped and relative addresses are resolved against <c>feed.Url</c>.
/// </remarks>
/// <param name="feed">The downloaded feed snapshot.</param>
/// <returns>The extracted addresses with duplicates removed.</returns>
public List<string> ExtractAddress(FeedSnapshot feed)
{
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (feed.UseBlock)
    {
        if (!string.IsNullOrEmpty(feed.BlockExpression))
        {
            block = JsonConvert.DeserializeObject<ExtractBlock>(feed.BlockExpression);
        }
    }
    else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
    {
        block.TileSelector.Selectors.Clear();
        var s = RuiJiExpression.ParserBase(feed.RuiJiExpression).Selectors;
        block.TileSelector.Selectors.AddRange(s);
    }

    var result = RuiJiExtracter.Extract(feed.Content, block);
    var results = new List<string>();

    if (result.Tiles == null)
    {
        return results;
    }

    // Created on first use only, so an unusable feed.Url still matters only
    // when a relative address actually has to be resolved.
    Uri baseUri = null;

    foreach (var item in result.Tiles)
    {
        var href = item.Content;
        if (href == null)
        {
            // A tile without content carries no address; skip it instead of
            // failing on the string operations below.
            continue;
        }

        // Drop the fragment part, if any.
        var fragmentStart = href.IndexOf('#');
        if (fragmentStart >= 0)
        {
            href = href.Substring(0, fragmentStart);
        }

        if (Uri.IsWellFormedUriString(href, UriKind.Relative))
        {
            if (baseUri == null)
            {
                baseUri = new Uri(feed.Url);
            }

            href = new Uri(baseUri, href).AbsoluteUri;
        }

        results.Add(href);
    }

    return results.Distinct().ToList();
}
/// <summary>
/// Returns the extraction blocks of all rules matching <paramref name="url"/>.
/// </summary>
/// <param name="url">The address to match against the stored rules.</param>
/// <param name="useBlock">
/// When true, each block is deserialized from the rule's JSON
/// <c>BlockExpression</c>; otherwise it is parsed from the rule's
/// <c>RuiJiExpression</c>.
/// </param>
/// <returns>
/// The list of blocks when the node serving this request is a FEEDPROXY node;
/// an empty anonymous object otherwise.
/// </returns>
public object UrlRule(string url, bool useBlock = false)
{
    var node = ServerManager.Get(Request.RequestUri.Authority);

    // Only FEEDPROXY nodes answer rule queries.
    if (node.NodeType != Node.NodeTypeEnum.FEEDPROXY)
    {
        return new { };
    }

    var matches = RuleLiteDB.Match(url);

    if (useBlock)
    {
        return matches
            .Select(rule => JsonConvert.DeserializeObject<ExtractBlock>(rule.BlockExpression))
            .ToList();
    }

    return matches
        .Select(rule => RuiJiExpression.ParserBlock(rule.RuiJiExpression))
        .ToList();
}
/// <summary>
/// Compiles a templated JSONP address, downloads it, and extracts with an
/// expression made of a regex step followed by a JSON path step; expects
/// non-empty extracted content.
/// </summary>
public void TestJsonPExtract()
{
    // The address contains {# ... #} template placeholders that are passed through Compile below.
    var template = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1";
    var compiler = new CompileFeedAddress();
    var address = compiler.Compile(template);

    var crawler = new RuiJiCrawler();
    var response = crawler.Request(new Request(address));

    var parsedBlock = RuiJiExpression.ParserBlock(@" reg /jsonp[\d]+?\((.*)\)/ 1 jpath $..url ");
    var extracted = RuiJiExtracter.Extract(response.Data.ToString(), parsedBlock);

    Assert.IsTrue(extracted.Content.Length > 0);
}