/// <summary>
/// Downloads a paged article, extracts the "title", "date_dt" and "content" metas plus the
/// paging links, and calls <c>PagingExtractor.MergeContent</c> when paging links and a
/// "content" meta are both present. The final assertion is unconditional, so the test only
/// fails if one of the calls throws.
/// </summary>
public void TestPaging2()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("https://3w.huanqiu.com/a/4e2d56fd7f51/7DHitRASkPC?p=1&agt=8");
    var html = crawler.Request(request).Data.ToString();

    var expression = @" [meta] #title css h1.a-title #date_dt css .time:text #content css .a-con:ohtml [paging] css .a-page css a[href]";
    var block = RuiJiBlockParser.ParserBlock(expression);

    var extracted = RuiJiExtractor.Extract(html, block);

    // Merging only applies when there are paging links and a "content" meta to merge into.
    var hasPagingLinks = extracted.Paging != null && extracted.Paging.Count > 0;
    if (hasPagingLinks && extracted.Metas != null && extracted.Metas.ContainsKey("content"))
    {
        extracted = PagingExtractor.MergeContent(request.Uri, extracted, block);
    }

    Assert.True(true);
}
/// <summary>
/// Downloads a blog post and runs a [meta] expression (title, author, date, words_i, content)
/// against it through <c>RuiJiParser.ParseExtract</c> and <c>RuiJiExtractor.Extract</c>.
/// The final assertion is unconditional, so the test only fails if one of the calls throws.
/// </summary>
public void TestExtractMeta()
{
    var crawler = new RuiJiCrawler();
    var pageRequest = new Request("https://my.oschina.net/zhupingqi/blog/1826317");
    var html = crawler.Request(pageRequest).Data.ToString();

    var parser = new RuiJiParser();
    var parsed = parser.ParseExtract(@" [meta] #title css h1.header:text #author css div.blog-meta .avatar + span:text #date css div.blog-meta > div.item:first:text regS /发布于/ 1 #words_i css div.blog-meta > div.item:eq(1):text regS / / 1 #content css #articleContent:html");

    var extracted = RuiJiExtractor.Extract(html, parsed.Result);

    Assert.True(true);
}
/// <summary>
/// Downloads the first page of a proxy list and runs a [tile]/[meta]/[paging] expression
/// against it. The final assertion is unconditional, so the test only fails if one of the
/// calls throws.
/// </summary>
public void TestPaging()
{
    var crawler = new RuiJiCrawler();
    var pageRequest = new Request("https://www.kuaidaili.com/free/inha/1/");
    var html = crawler.Request(pageRequest).Data.ToString();

    // This instance is not passed to the extractor below; the parsed block is used instead.
    var block = new ExtractBlock();

    var parsedBlock = RuiJiBlockParser.ParserBlock(@" [tile] css table.table-bordered tr:gt(0):ohtml [meta] #ip css td[data-title='IP']:text # port css td[data-title='PORT']:text [paging] css #listnav a:[href] ");

    var extracted = RuiJiExtractor.Extract(html, parsedBlock);

    Assert.True(true);
}
/// <summary>
/// Runs an extract request either in-process (standalone mode) or by POSTing it to an
/// elected proxy server at <c>api/ep/extract</c>.
/// </summary>
/// <param name="request">The extract request to execute.</param>
/// <returns>The results returned by the local extractor or deserialized from the proxy response.</returns>
/// <exception cref="Exception">Thrown when no proxy address could be elected.</exception>
public static List<ExtractResult> Extract(ExtractRequest request)
{
    if (NodeConfigurationSection.Standalone)
    {
        return RuiJiExtractor.Extract(request);
    }

    // NOTE(review): a FEEDPROXY is elected here although the error message below talks about
    // Extractor proxy servers — confirm which proxy type is intended.
    var proxyUrl = ProxyManager.Instance.Elect(NodeProxyTypeEnum.FEEDPROXY);
    if (string.IsNullOrEmpty(proxyUrl))
    {
        throw new Exception("no available Extractor proxy servers");
    }

    proxyUrl = IPHelper.FixLocalUrl(proxyUrl);

    var client = new RestClient("http://" + proxyUrl);
    var restRequest = new RestRequest("api/ep/extract")
    {
        Method = Method.POST,
        JsonSerializer = new NewtonJsonSerializer(),
        Timeout = 15000
    };

    // The request is serialized to a JSON string first and that string is then added as the
    // JSON body; the receiving action takes the body as a string and deserializes it itself.
    var payload = JsonConvert.SerializeObject(request);
    restRequest.AddJsonBody(payload);

    var restResponse = client.Execute(restRequest);
    return JsonConvert.DeserializeObject<List<ExtractResult>>(restResponse.Content);
}
/// <summary>
/// Runs a feed definition once for testing: expands the feed address, downloads each
/// resulting address, applies the feed's RuiJi expression and optionally downloads every
/// absolute URL found in the extracted content.
/// </summary>
/// <param name="feed">The feed definition; its <c>Address</c> is overwritten with each expanded address.</param>
/// <param name="down">When true, each line of the extracted content that is a well-formed absolute URL is requested and stored under "www/download".</param>
/// <param name="debug">When false, <c>CrawlTaskFunc.ClearContent</c> is applied to each result.</param>
/// <returns>
/// A list with one <c>ExtractResult</c> per expanded address, or the caught exception object
/// if anything throws.
/// </returns>
public object TestFeed(FeedModel feed, [FromUri] bool down, [FromUri] bool debug = false)
{
    try
    {
        var compile = new UrlCompile();
        var addrs = compile.GetResult(feed.Address);
        var results = new List<ExtractResult>();

        foreach (var addr in addrs)
        {
            feed.Address = addr.ToString();

            var job = new FeedJob();
            var snap = job.DoTask(feed, false);

            // Without an expression there is nothing to extract; keep a placeholder so the
            // result list still has one entry per address.
            if (string.IsNullOrEmpty(feed.RuiJiExpression))
            {
                results.Add(new ExtractResult());
                continue;
            }

            var block = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);
            var result = RuiJiExtractor.Extract(snap.Content, block);

            // NOTE(review): ClearContent runs before the download step reads result.Content;
            // confirm this ordering is intended when down is true and debug is false.
            if (!debug)
            {
                CrawlTaskFunc.ClearContent(result);
            }

            if (down)
            {
                var storage = new FileStorage(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download"));
                var lines = result.Content.ToString().Replace("\r\n", "\n").Split('\n');

                foreach (var line in lines)
                {
                    // Trim once up front so that validation, the request and the stored URL
                    // all use the same value; previously only the stored URL was trimmed, so
                    // surrounding whitespace (e.g. a stray '\r') affected validation and the request.
                    var url = line.Trim();
                    if (string.IsNullOrEmpty(url) || !Uri.IsWellFormedUriString(url, UriKind.Absolute))
                    {
                        continue;
                    }

                    var res = Crawler.Request(url);

                    var c = new DownloadContentModel();
                    c.Url = url;
                    c.IsRaw = res.IsRaw;
                    c.Data = res.Data;

                    storage.Insert(c);
                }
            }

            results.Add(result);
        }

        return results;
    }
    catch (Exception ex)
    {
        // The exception object itself is returned to the caller instead of being rethrown.
        return ex;
    }
}
/// <summary>
/// Builds an <c>ExtractBlock</c> in code (content selector, tile selector and two tile metas),
/// runs it against a downloaded page and asserts that both content and tiles were extracted.
/// </summary>
public void TestLocalExtract()
{
    var crawler = new RuiJiCrawler();
    var pageRequest = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");
    var html = crawler.Request(pageRequest).Data.ToString();

    var block = new ExtractBlock
    {
        Selectors = new List<ISelector>
        {
            new CssSelector(".entry-content", CssTypeEnum.INNERHTML)
        },
        TileSelector = new ExtractTile
        {
            Selectors = new List<ISelector>
            {
                new CssSelector(".pt-cv-content-item", CssTypeEnum.INNERHTML)
            }
        }
    };

    // Tile metas are registered through the (name, selectors) overload of AddMeta.
    block.TileSelector.Metas.AddMeta("title", new List<ISelector>
    {
        new CssSelector(".pt-cv-title")
    });
    block.TileSelector.Metas.AddMeta("url", new List<ISelector>
    {
        new CssSelector(".pt-cv-readmore", "href")
    });

    var extracted = RuiJiExtractor.Extract(html, block);

    Assert.IsTrue(extracted.Content.ToString().Length > 0);
    Assert.IsTrue(extracted.Tiles.Count > 0);
}
/// <summary>
/// Downloads a blog index page and runs a two-line expression (a css link selector followed
/// by an "exp" URL pattern) against it. The final assertion is unconditional, so the test
/// only fails if one of the calls throws.
/// </summary>
public void TestExtract2()
{
    var crawler = new RuiJiCrawler();
    var pageRequest = new Request("https://www.oschina.net/blog");
    var html = crawler.Request(pageRequest).Data.ToString();

    var parser = new RuiJiParser();
    var parsed = parser.ParseExtract("css a.blog-title-link:[href]\nexp https://my.oschina.net/*/blog/*");

    var extracted = RuiJiExtractor.Extract(html, parsed.Result);

    Assert.True(true);
}
/// <summary>
/// Handles an extract call whose body is a JSON string containing a serialized
/// <c>ExtractRequest</c>. Nodes of type Extractor run the extraction locally; any other node
/// type forwards the request through <c>Extractor.Extract</c>.
/// </summary>
/// <param name="json">The serialized <c>ExtractRequest</c>.</param>
/// <returns>The extraction results.</returns>
public List<ExtractResult> Extract([FromBody] string json)
{
    // The node is looked up by the authority (host:port) of the incoming request.
    var node = ServerManager.Get(Request.RequestUri.Authority);
    var extractRequest = JsonConvert.DeserializeObject<ExtractRequest>(json);

    if (node.NodeType != Node.NodeTypeEnum.Extractor)
    {
        return Extractor.Extract(extractRequest);
    }

    return RuiJiExtractor.Extract(extractRequest);
}
/// <summary>
/// Extracts the distinct link addresses from a feed snapshot's content.
/// </summary>
/// <remarks>
/// The default block selects the "href" of every "a" element. When <c>feed.UseBlock</c> is
/// set and a block expression is present, the block is deserialized from that JSON instead;
/// otherwise, when a RuiJi expression is present, its selectors replace the default tile
/// selector. URL fragments are stripped and relative links are resolved against <c>feed.Url</c>.
/// </remarks>
/// <param name="feed">The snapshot providing the content, base URL and expressions.</param>
/// <returns>The distinct addresses, in first-seen order.</returns>
public List<string> ExtractAddress(FeedSnapshot feed)
{
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (feed.UseBlock)
    {
        if (!string.IsNullOrEmpty(feed.BlockExpression))
        {
            block = JsonConvert.DeserializeObject<ExtractBlock>(feed.BlockExpression);
        }
    }
    else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
    {
        var tileSelectors = block.TileSelector.Selectors;
        tileSelectors.Clear();

        // This parser instance is not used afterwards; parsing goes through RuiJiBlockParser.
        var parser = new RuiJiParser();
        var parsedSelectors = RuiJiBlockParser.ParserBase(feed.RuiJiExpression).Selectors;
        tileSelectors.AddRange(parsedSelectors);
    }

    var extracted = RuiJiExtractor.Extract(feed.Content, block);
    var addresses = new List<string>();

    if (extracted.Tiles != null)
    {
        foreach (var tile in extracted.Tiles)
        {
            var link = tile.Content.ToString();

            // Drop the fragment part, if any.
            var hashIndex = link.IndexOf('#');
            if (hashIndex >= 0)
            {
                link = link.Substring(0, hashIndex);
            }

            // Relative links are resolved against the URL the snapshot was taken from.
            if (Uri.IsWellFormedUriString(link, UriKind.Relative))
            {
                link = new Uri(new Uri(feed.Url), link).AbsoluteUri;
            }

            addresses.Add(link);
        }
    }

    return addresses.Distinct().ToList();
}
/// <summary>
/// Downloads a page and extracts tiles using the selectors parsed from "css a:[href]".
/// The final assertion is unconditional, so the test only fails if one of the calls throws.
/// </summary>
public void TestExtract()
{
    var crawler = new RuiJiCrawler();
    var pageRequest = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");
    var html = crawler.Request(pageRequest).Data.ToString();

    var block = new ExtractBlock();
    var linkSelectors = RuiJiBlockParser.ParserBase("css a:[href]").Selectors;
    block.TileSelector.Selectors.AddRange(linkSelectors);

    var extracted = RuiJiExtractor.Extract(html, block);

    Assert.True(true);
}
/// <summary>
/// Requests a JSONP endpoint, unwraps the callback with a regex, selects every "url" value
/// with a JSONPath expression and asserts that the extracted content is not empty.
/// </summary>
public void TestJsonPExtract()
{
    var url = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1";

    // The compile step is disabled, so the URL is requested with its placeholders as written:
    //url = f.Compile(url);
    var f = new UrlCompile();

    var crawler = new RuiJiCrawler();
    var response = crawler.Request(new Request(url));

    var expression = @" reg /jsonp[\d]+?\((.*)\)/ 1 jpath $..url ";
    var block = RuiJiBlockParser.ParserBlock(expression);

    var extracted = RuiJiExtractor.Extract(response.Data.ToString(), block);

    Assert.IsTrue(extracted.Content.ToString().Length > 0);
}
/// <summary>
/// Downloads one page of a proxy list, extracts tiles and paging links, and — when paging
/// links and tiles are present — calls <c>PagingExtractor.CrawlPage</c>, storing the
/// serialized tiles of each callback invocation under "www/download". The final assertion is
/// unconditional, so the test only fails if one of the calls throws.
/// </summary>
public void TestPaging()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("https://www.kuaidaili.com/free/inha/10");
    var html = crawler.Request(request).Data.ToString();

    var expression = @" [tile] css table.table-bordered tr:gt(0):ohtml [meta] #ip css td[data-title='IP']:text #port css td[data-title='PORT']:text [paging] css #listnav a[href]";
    var block = RuiJiBlockParser.ParserBlock(expression);

    var extracted = RuiJiExtractor.Extract(html, block);

    var hasPagingLinks = extracted.Paging != null && extracted.Paging.Count > 0;
    if (hasPagingLinks && extracted.Tiles != null)
    {
        var downloadDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download");
        var storage = new FileStorage(downloadDirectory);

        PagingExtractor.CrawlPage(request.Uri, extracted, block, (pageUri, pageResult) =>
        {
            // Each callback's tiles are stored as JSON, keyed by the URI's absolute path.
            var entry = new DownloadContentModel
            {
                Url = pageUri.AbsolutePath.Trim(),
                IsRaw = false,
                Data = JsonConvert.SerializeObject(pageResult.Tiles)
            };

            storage.Insert(entry);
        }, int.MaxValue);
    }

    Assert.True(true);
}
/// <summary>
/// Extracts the distinct link addresses from a snapshot's content.
/// </summary>
/// <remarks>
/// By default the "href" of every "a" element is selected; when the snapshot carries an
/// expression, the selectors parsed from it replace that default. URL fragments are stripped
/// and relative links are resolved against <c>snapshot.RequestUrl</c>.
/// </remarks>
/// <param name="snapshot">The snapshot providing the content, request URL and optional expression.</param>
/// <returns>The distinct addresses, in first-seen order.</returns>
protected List<string> ExtractFeedAddress(Snapshot snapshot)
{
    var block = new ExtractBlock();
    var tileSelectors = block.TileSelector.Selectors;
    tileSelectors.Add(new CssSelector("a", "href"));

    if (!string.IsNullOrEmpty(snapshot.Expression))
    {
        tileSelectors.Clear();

        // This parser instance is not used afterwards; parsing goes through RuiJiBlockParser.
        var parser = new RuiJiParser();
        var parsedSelectors = RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors;
        tileSelectors.AddRange(parsedSelectors);
    }

    var extracted = RuiJiExtractor.Extract(snapshot.Content, block);
    var addresses = new List<string>();

    if (extracted.Tiles != null)
    {
        foreach (var tile in extracted.Tiles)
        {
            var link = tile.Content.ToString();

            // Cut the link at the first '#', removing any fragment.
            var fragmentStart = link.IndexOf('#');
            if (fragmentStart >= 0)
            {
                link = link.Substring(0, fragmentStart);
            }

            // Make relative links absolute using the snapshot's request URL as the base.
            if (Uri.IsWellFormedUriString(link, UriKind.Relative))
            {
                var baseUri = new Uri(snapshot.RequestUrl);
                link = new Uri(baseUri, link).AbsoluteUri;
            }

            addresses.Add(link);
        }
    }

    return addresses.Distinct().ToList();
}
/// <summary>
/// Downloads a category page and runs a [tile]/[meta] expression (title and summary per
/// article) against it. The final assertion is unconditional, so the test only fails if one
/// of the calls throws.
/// </summary>
public void TestExtractTile()
{
    var crawler = new RuiJiCrawler();
    var pageRequest = new Request("http://www.ruijihg.com/archives/category/tech/bigdata");
    var html = crawler.Request(pageRequest).Data.ToString();

    var parser = new RuiJiParser();
    var parsed = parser.ParseExtract(@"[tile] css article:html [meta] #title css .entry-header:text #summary css .entry-header + p:text ex /Read more »/ -e");

    var extracted = RuiJiExtractor.Extract(html, parsed.Result);

    Assert.True(true);
}
/// <summary>
/// Executes a crawl task: loads the feed, downloads every expanded feed address, extracts the
/// feed page itself and then, when the snapshot has a RuiJi expression, fetches a result for
/// each address found on it. Progress messages are reported along the way.
/// </summary>
/// <param name="t">The task payload; used as a <c>CrawlTaskModel</c>.</param>
/// <param name="task">The parallel task whose <c>Progress</c> is used as an <c>IProgress&lt;string&gt;</c>.</param>
/// <returns>
/// A list containing, per feed address, the feed page's extract result followed by one
/// <c>ContentModel</c> for every address that yielded a non-null result.
/// </returns>
public object Run(object t, ParallelTask task)
{
    // NOTE(review): both casts use "as" and are dereferenced without a null check — confirm
    // callers always pass a CrawlTaskModel and an IProgress<string>.
    var model = t as CrawlTaskModel;
    var collected = new List<object>();
    var progress = task.Progress as IProgress<string>;

    progress.Report("正在读取Feed记录");
    var feed = FeedLiteDb.GetFeed(model.FeedId);

    progress.Report("正在下载 Feed");
    var compiler = new UrlCompile();
    var feedAddresses = compiler.GetResult(feed.Address);

    foreach (var feedAddress in feedAddresses)
    {
        feed.Address = feedAddress.ToString();

        var feedJob = new FeedJob();
        var snapshot = feedJob.DoTask(feed, false);
        progress.Report("Feed 下载完成");

        // The feed page itself is extracted and recorded first.
        var feedBlock = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);
        collected.Add(RuiJiExtractor.Extract(snapshot.Content, feedBlock));

        progress.Report("正在提取Feed地址");
        var extractJob = new FeedExtractJob();
        var urls = extractJob.ExtractAddress(snapshot);
        progress.Report("Feed地址提取完成");

        if (!string.IsNullOrEmpty(snapshot.RuiJiExpression))
        {
            foreach (var url in urls)
            {
                progress.Report("正在提取地址 " + url);

                var pageResult = Cooperater.GetResult(url);
                if (pageResult == null)
                {
                    continue;
                }

                collected.Add(new ContentModel
                {
                    Id = model.FeedId,
                    Url = url,
                    Metas = pageResult.Metas,
                    CDate = DateTime.Now
                });
            }
        }

        progress.Report("计算完成");

        // Runs once per feed address, over everything collected so far.
        if (!model.IncludeContent)
        {
            foreach (var entry in collected)
            {
                ClearContent(entry);
            }
        }
    }

    return collected;
}