/// <summary>
/// Downloads https://www.kuaidaili.com/free/inha/1/ and runs a RuiJi expression
/// with [tile], [meta] and [paging] sections against the returned content.
/// </summary>
/// <remarks>
/// Performs a live HTTP request through <c>RuiJiCrawler</c>.
/// </remarks>
public void TestPaging()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("https://www.kuaidaili.com/free/inha/1/");
    var response = crawler.Request(request);
    var content = response.Data.ToString();

    // NOTE(review): other code in this source calls RuiJiExpression.ParserBlock;
    // confirm that PaserBlock is the intended method name.
    // NOTE(review): the expression is written on a single line here; confirm the
    // parser does not expect each section/selector on its own line.
    var s = RuiJiExpression.PaserBlock(@" [tile] css table.table-bordered tr:gt(0):ohtml [meta] #ip css td[data-title='IP']:text # port css td[data-title='PORT']:text [paging] css #listnav a[href] ");

    var result = RuiJiExtracter.Extract(content, s);

    // Assert on the extraction outcome instead of a constant that can never fail.
    Assert.IsNotNull(result);
}
/// <summary>
/// Runs a crawl task: reads the feed referenced by the task model, downloads it,
/// extracts the feed content with the feed's RuiJi expression, then extracts every
/// address found in the feed snapshot.
/// </summary>
/// <param name="t">The task argument; must be a <c>CrawlTaskModel</c>.</param>
/// <param name="task">
/// The running task. Status messages are sent to its <c>Progress</c> when that
/// object implements <see cref="IProgress{T}"/> of string; otherwise they are skipped.
/// </param>
/// <returns>
/// A list of <c>ExtractResult</c>: the feed result first, followed by the results
/// returned for each extracted address.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="t"/> is not a <c>CrawlTaskModel</c>.
/// </exception>
public object Run(object t, ParallelTask task)
{
    var model = t as CrawlTaskModel;
    if (model == null)
    {
        // Fail with a clear message instead of a NullReferenceException further down.
        throw new ArgumentException("Expected a CrawlTaskModel instance.", "t");
    }

    var results = new List<ExtractResult>();

    // The cast may yield null; progress reporting is then silently skipped.
    var reporter = task.Progress as IProgress<string>;
    Action<string> report = message =>
    {
        if (reporter != null)
        {
            reporter.Report(message);
        }
    };

    report("正在读取Feed记录");
    var feed = FeedLiteDb.GetFeed(model.FeedId);

    report("正在下载 Feed");
    var compile = new CompileFeedAddress();
    feed.Address = compile.Compile(feed.Address);

    var job = new FeedJob();
    var snap = job.DoTask(feed, false);
    report("Feed 下载完成");

    // Extract the feed page itself and keep it as the first result.
    var block = RuiJiExpression.ParserBlock(feed.RuiJiExpression);
    var feedResult = RuiJiExtracter.Extract(snap.Content, block);
    results.Add(feedResult);

    report("正在提取Feed地址");
    var j = new FeedExtractJob();
    var urls = j.ExtractAddress(snap);
    report("Feed地址提取完成");

    foreach (var url in urls)
    {
        report("正在提取地址 " + url);
        var r = ContentQueue.Instance.Extract(url);
        results.AddRange(r);
    }

    report("计算完成");

    if (!model.IncludeContent)
    {
        foreach (var m in results)
        {
            ClearContent(m);
        }
    }

    return results;
}
/// <summary>
/// Downloads a page from www.ruijihg.com and extracts it with a block built in code:
/// a content selector, a tile selector, and "title" / "url" metas on each tile.
/// </summary>
/// <remarks>
/// Performs a live HTTP request through <c>RuiJiCrawler</c>.
/// </remarks>
public void TestLocalExtract()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");
    var response = crawler.Request(request);
    var content = response.Data.ToString();

    var block = new ExtractBlock();
    block.Selectors = new List<ISelector>
    {
        new CssSelector(".entry-content", CssTypeEnum.InnerHtml)
    };
    block.TileSelector = new ExtractTile
    {
        Selectors = new List<ISelector>
        {
            new CssSelector(".pt-cv-content-item", CssTypeEnum.InnerHtml)
        }
    };

    // Metas are registered through the (name, selectors) overload of AddMeta.
    block.TileSelector.Metas.AddMeta("title", new List<ISelector>
    {
        new CssSelector(".pt-cv-title")
    });
    block.TileSelector.Metas.AddMeta("url", new List<ISelector>
    {
        new CssSelector(".pt-cv-readmore", "href")
    });

    var r = RuiJiExtracter.Extract(content, block);

    Assert.IsTrue(r.Content.Length > 0);
    Assert.IsTrue(r.Tiles.Count > 0);
}
/// <summary>
/// Deserializes the posted JSON into an <c>ExtractRequest</c> and returns its
/// extraction result.
/// </summary>
/// <param name="json">The request body: a JSON-serialized <c>ExtractRequest</c>.</param>
/// <returns>
/// The result of <c>RuiJiExtracter.Extract</c> when the node registered for the
/// request's authority is of type EXTRACTER; otherwise the result of
/// <c>Extracter.Extract</c>.
/// </returns>
public ExtractResult Extract([FromBody] string json)
{
    var currentNode = ServerManager.Get(Request.RequestUri.Authority);
    var extractRequest = JsonConvert.DeserializeObject<ExtractRequest>(json);

    // NOTE(review): presumably Extracter.Extract hands the request to an
    // extracter node - confirm against its implementation.
    if (currentNode.NodeType != Node.NodeTypeEnum.EXTRACTER)
    {
        return Extracter.Extract(extractRequest);
    }

    return RuiJiExtracter.Extract(extractRequest.Content, extractRequest.Block);
}
/// <summary>
/// Downloads a page from www.ruijihg.com and extracts it with a block whose tile
/// selectors are parsed from the expression "css a[href]".
/// </summary>
/// <remarks>
/// Performs a live HTTP request through <c>RuiJiCrawler</c>.
/// </remarks>
public void TestExtract()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");
    var response = crawler.Request(request);
    var content = response.Data.ToString();

    var block = new ExtractBlock();
    var s = RuiJiExpression.ParserBase("css a[href]").Selectors;
    block.TileSelector.Selectors.AddRange(s);

    var result = RuiJiExtracter.Extract(content, block);

    // Assert on the extraction outcome instead of a constant that can never fail.
    Assert.IsNotNull(result);
}
/// <summary>
/// Extracts the distinct addresses referenced by a feed snapshot.
/// </summary>
/// <remarks>
/// The default block selects the href of every anchor. When <c>feed.UseBlock</c>
/// is set and <c>feed.BlockExpression</c> is non-empty, the block is deserialized
/// from that JSON instead; when <c>feed.UseBlock</c> is not set and
/// <c>feed.RuiJiExpression</c> is non-empty, the tile selectors are replaced by the
/// ones parsed from that expression. Each extracted value has its "#fragment"
/// removed, and relative values are resolved against <c>feed.Url</c>.
/// </remarks>
/// <param name="feed">The downloaded feed snapshot to scan.</param>
/// <returns>The distinct extracted addresses; empty when no tiles were found.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="feed"/> is null.
/// </exception>
public List<string> ExtractAddress(FeedSnapshot feed)
{
    if (feed == null)
    {
        throw new ArgumentNullException("feed");
    }

    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (feed.UseBlock)
    {
        if (!string.IsNullOrEmpty(feed.BlockExpression))
        {
            block = JsonConvert.DeserializeObject<ExtractBlock>(feed.BlockExpression);
        }
    }
    else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
    {
        block.TileSelector.Selectors.Clear();
        var s = RuiJiExpression.ParserBase(feed.RuiJiExpression).Selectors;
        block.TileSelector.Selectors.AddRange(s);
    }

    var result = RuiJiExtracter.Extract(feed.Content, block);
    var results = new List<string>();

    if (result.Tiles == null)
    {
        return results;
    }

    foreach (var item in result.Tiles)
    {
        var href = item.Content;
        if (href == null)
        {
            // A tile without content cannot be turned into an address; skip it
            // rather than failing the whole extraction.
            continue;
        }

        // Strip the fragment with a single scan of the string.
        var fragmentIndex = href.IndexOf('#');
        if (fragmentIndex >= 0)
        {
            href = href.Substring(0, fragmentIndex);
        }

        if (Uri.IsWellFormedUriString(href, UriKind.Relative))
        {
            href = new Uri(new Uri(feed.Url), href).AbsoluteUri;
        }

        results.Add(href);
    }

    return results.Distinct().ToList();
}
/// <summary>
/// Compiles a templated JSONP address, downloads it, and extracts it with a RuiJi
/// expression made of a "reg" step followed by a "jpath" step.
/// </summary>
/// <remarks>
/// Performs a live HTTP request through <c>RuiJiCrawler</c>.
/// </remarks>
public void TestJsonPExtract()
{
    // The address contains {# ... #} placeholders that CompileFeedAddress expands.
    var addressCompiler = new CompileFeedAddress();
    var compiledUrl = addressCompiler.Compile("http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1");

    var crawler = new RuiJiCrawler();
    var jsonpRequest = new Request(compiledUrl);
    var response = crawler.Request(jsonpRequest);

    var block = RuiJiExpression.ParserBlock(@" reg /jsonp[\d]+?\((.*)\)/ 1 jpath $..url ");
    var body = response.Data.ToString();
    var result = RuiJiExtracter.Extract(body, block);

    Assert.IsTrue(result.Content.Length > 0);
}