// Smoke test: downloads a paged article, extracts its metas and paging links,
// and merges the paged content when a "content" meta was extracted.
// The final assertion is unconditional, so the test only fails if a step throws.
public void TestPaging2()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("https://3w.huanqiu.com/a/4e2d56fd7f51/7DHitRASkPC?p=1&agt=8");
    var html = crawler.Request(request).Data.ToString();

    var expression = @" [meta] #title css h1.a-title #date_dt css .time:text #content css .a-con:ohtml [paging] css .a-page css a[href]";
    var block = RuiJiBlockParser.ParserBlock(expression);

    var extracted = RuiJiExtractor.Extract(html, block);

    // Merging only makes sense when there are further pages and a content meta to append to.
    var hasPaging = extracted.Paging != null && extracted.Paging.Count > 0;
    if (hasPaging && extracted.Metas != null && extracted.Metas.ContainsKey("content"))
    {
        extracted = PagingExtractor.MergeContent(request.Uri, extracted, block);
    }

    Assert.True(true);
}
// Smoke test: downloads a proxy list page and runs a tile/meta/paging
// expression against it. The final assertion is unconditional, so the test
// only fails if downloading, parsing or extracting throws.
public void TestPaging()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("https://www.kuaidaili.com/free/inha/1/");
    var response = crawler.Request(request);
    var content = response.Data.ToString();

    // NOTE(review): "# port" contains a space after '#', unlike "#ip" —
    // confirm whether the parser accepts this or whether it is a typo.
    var block = RuiJiBlockParser.ParserBlock(@" [tile] css table.table-bordered tr:gt(0):ohtml [meta] #ip css td[data-title='IP']:text # port css td[data-title='PORT']:text [paging] css #listnav a:[href] ");

    var result = RuiJiExtractor.Extract(content, block);

    Assert.True(true);
}
// Runs a feed definition end to end for every address its address template
// expands to, and returns the list of extract results.
//
// feed  - feed definition; its Address is overwritten with each expanded address.
// down  - when true, every well-formed absolute URL found in the extracted
//         content (one per line) is requested and stored under "www/download".
// debug - when false, CrawlTaskFunc.ClearContent is applied to each result.
//
// Returns the List<ExtractResult> on success, or the caught Exception object
// if anything throws (existing contract, kept for callers).
public object TestFeed(FeedModel feed, [FromUri] bool down, [FromUri] bool debug = false)
{
    try
    {
        var compile = new UrlCompile();
        var addrs = compile.GetResult(feed.Address);
        var results = new List<ExtractResult>();

        foreach (var addr in addrs)
        {
            feed.Address = addr.ToString();

            var job = new FeedJob();
            var snap = job.DoTask(feed, false);

            // Without an expression there is nothing to extract; keep a placeholder
            // so the result list still has one entry per address.
            if (string.IsNullOrEmpty(feed.RuiJiExpression))
            {
                results.Add(new ExtractResult());
                continue;
            }

            var block = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);
            var result = RuiJiExtractor.Extract(snap.Content, block);

            if (!debug)
            {
                CrawlTaskFunc.ClearContent(result);
            }

            // Content is read after ClearContent may have run (presumably it can
            // reset Content — confirm); guard so a missing content does not turn
            // the whole response into an exception.
            if (down && result.Content != null)
            {
                var storage = new FileStorage(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download"));
                var lines = result.Content.ToString().Replace("\r\n", "\n").Split('\n');

                foreach (var line in lines)
                {
                    // Validate and request the same trimmed value that is stored as
                    // the Url, so stray whitespace (e.g. a lone '\r') neither rejects
                    // an otherwise valid link nor leaks into the request.
                    var url = line.Trim();
                    if (url.Length == 0 || !Uri.IsWellFormedUriString(url, UriKind.Absolute))
                    {
                        continue;
                    }

                    var res = Crawler.Request(url);

                    var model = new DownloadContentModel();
                    model.Url = url;
                    model.IsRaw = res.IsRaw;
                    model.Data = res.Data;

                    storage.Insert(model);
                }
            }

            results.Add(result);
        }

        return results;
    }
    catch (Exception ex)
    {
        // The exception itself is the response payload.
        return ex;
    }
}
// Finds the rules matching the given url and wraps each one in an
// ExtractFeatureBlock together with the rule's feature text.
//
// url      - address to match against the rule store.
// useBlock - true: build the block by deserializing the rule's JSON
//            BlockExpression; false: parse the rule's RuiJiExpression.
//
// Returns a List<ExtractFeatureBlock> (typed as object).
public object MatchUrlRule(string url, bool useBlock = false)
{
    var matches = RuleLiteDb.Match(url);

    if (!useBlock)
    {
        return matches
            .Select(rule => new ExtractFeatureBlock(RuiJiBlockParser.ParserBlock(rule.RuiJiExpression), rule.Feature))
            .ToList();
    }

    return matches
        .Select(rule => new ExtractFeatureBlock(JsonConvert.DeserializeObject<ExtractBlock>(rule.BlockExpression), rule.Feature))
        .ToList();
}
// Parses a meta expression that uses one section per selector kind
// (css, exclude, expression, regex, regex replace/split, text range,
// xpath, json path) and checks that at least one meta was produced.
public void TestMeta()
{
    var expression = @" #css css h4 a[href] -r css h4:ohtml css h4:html -r css h4 a:text #exclude ex /:/ -b ex /\-e/ -e ex /\-/ -a ex /[\d]*/ #expression exp ????/??/?? ??:??:??* exp datetime_?? -r #regex reg /[\d]*/ reg /aa([\d]*)sf/ 0 1 reg /aa([\d]*)sf/ -r #regexReplace regR />>/ > #regexSplit regS /aaa/ 2 3 5 regS /aaa/ 2 3 5 -r #textRange text /a\naa/ /b\tbb/ text /aaa/ /bbb/ -r #xpath xpath ladkfeio xpath dlqwekrjl -r #jsonPath jpath dlsldf.kljs jpath dlkejl -r ";

    var parsedMetas = RuiJiBlockParser.ParserMeta(expression);

    Assert.True(parsedMetas.Count > 0);
}
// Parses a block expression whose metas use the reg / regR / regS selectors
// with zero, one and two arguments, then pushes the parsed block through JSON
// serialization and back to expression text. Only the meta count is asserted;
// the serialize / convert calls are exercised for exceptions.
public void TestReg()
{
    var source = @" [block] [blocks] @block1 @block2 [tile] css img [meta] #title css img:[title] regR /aabbcc/ regR /aabbcc/ 123 #src css img:[src] reg /aabbcc/ reg /aabbcc/ 1 reg /aabbcc/ 1 2 regS /aabbcc/ regS /aabbcc/ 1 regS /aabbcc/ 1 2 [paging] css #listnav a:[href] [paging] css #listnav a:[href] [block] #block1 css .list1 [block] #block2 css .list2 ";

    var parsed = RuiJiBlockParser.ParserBlock(source);

    // Results are intentionally unused: these calls just have to succeed.
    JsonConvert.SerializeObject(parsed);
    source = Converter.ToExpression(parsed);

    Assert.True(parsed.Metas.Count > 0);
}
// Parses a block expression whose names carry suffixes such as _dt, _i, _d,
// _f, _b and _l (presumably type hints — confirm against the parser) and
// checks that the parsed block contains at least one meta.
public void TestExpressionType()
{
    var expression = @" [block] #name_dda_ee css .entry-content:html [blocks] @block1 @block2 [tile] #aa_l css a:ohtml [meta] #time_dt css time:text [meta] #time_dt css time:text #words_i css .author:text #score_d css .entry-title:text #score_1_f css .entry-content:html #hasLink_b css h4 a:[href] -r [block] #block1 css .list1 [block] #block2 css .list2 ";

    var parsed = RuiJiBlockParser.ParserBlock(expression);

    Assert.True(parsed.Metas.Count > 0);
}
// Extracts the distinct link addresses from a downloaded feed snapshot.
//
// The tile selector defaults to the href of every <a>. When feed.UseBlock is
// set and a BlockExpression is present, the whole block is replaced by the
// deserialized JSON block; otherwise a non-empty RuiJiExpression replaces the
// default tile selectors. Fragments ("#...") are stripped and relative links
// are resolved against feed.Url.
//
// Returns the de-duplicated list of addresses (empty when no tiles were found).
public List<string> ExtractAddress(FeedSnapshot feed)
{
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (feed.UseBlock)
    {
        if (!string.IsNullOrEmpty(feed.BlockExpression))
        {
            block = JsonConvert.DeserializeObject<ExtractBlock>(feed.BlockExpression);
        }
    }
    else if (!string.IsNullOrEmpty(feed.RuiJiExpression))
    {
        block.TileSelector.Selectors.Clear();

        var selectors = RuiJiBlockParser.ParserBase(feed.RuiJiExpression).Selectors;
        block.TileSelector.Selectors.AddRange(selectors);
    }

    var result = RuiJiExtractor.Extract(feed.Content, block);
    var results = new List<string>();

    if (result.Tiles != null)
    {
        // Built on first use only, so an unusable feed.Url still cannot fail
        // a page that contains no relative links.
        Uri baseUri = null;

        foreach (var item in result.Tiles)
        {
            var href = item.Content.ToString();

            // Drop the fragment: it never changes which document is fetched.
            var fragmentStart = href.IndexOf('#');
            if (fragmentStart >= 0)
            {
                href = href.Substring(0, fragmentStart);
            }

            if (Uri.IsWellFormedUriString(href, UriKind.Relative))
            {
                if (baseUri == null)
                {
                    baseUri = new Uri(feed.Url);
                }

                href = new Uri(baseUri, href).AbsoluteUri;
            }

            results.Add(href);
        }
    }

    return results.Distinct().ToList();
}
// Smoke test: downloads a page and extracts every anchor href as a tile.
// The final assertion is unconditional, so the test only fails if a step throws.
public void TestExtract()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("http://www.ruijihg.com/%e5%bc%80%e5%8f%91/");
    var html = crawler.Request(request).Data.ToString();

    var block = new ExtractBlock();
    block.TileSelector.Selectors.AddRange(RuiJiBlockParser.ParserBase("css a:[href]").Selectors);

    var extracted = RuiJiExtractor.Extract(html, block);

    Assert.True(true);
}
// Parses a block expression containing named [block], [blocks], [tile] and
// [meta] sections and checks that the parsed block has at least one meta.
public void TestBlock()
{
    var expression = @" [block] #name css .entry-content:html [blocks] @block1 @block2 [tile] #aa css a:ohtml [meta] #time css time:text [meta] #time css time:text #author css .author:text #title css .entry-title:text #content css .entry-content:html #link css h4 a[href] -r [block] #block1 css .list1 [block] #block2 css .list2 ";

    var parsed = RuiJiBlockParser.ParserBlock(expression);

    Assert.True(parsed.Metas.Count > 0);
}
// Round-trips a parsed block through JSON (serialize, then deserialize as
// ExtractBlock), converts the restored block back to expression text, and
// checks that the restored block still has at least one meta.
public void TestJC2()
{
    var source = @" [block] [blocks] @block1 @block2 [tile] css img [meta] #title css img:[title] proc aabbcc #src css img:[src] [paging] css #listnav a:[href] [paging] css #listnav a:[href] [block] #block1 css .list1 [block] #block2 css .list2 ";

    var parsed = RuiJiBlockParser.ParserBlock(source);

    var serialized = JsonConvert.SerializeObject(parsed);
    var restored = JsonConvert.DeserializeObject<ExtractBlock>(serialized);

    // Result is unused; the conversion just has to succeed on the restored block.
    source = Converter.ToExpression(restored);

    Assert.True(restored.Metas.Count > 0);
}
// Requests a JSONP endpoint, unwraps the callback with a regex selector,
// pulls every "url" value with a JSON path selector, and checks that the
// extracted content is non-empty.
public void TestJsonPExtract()
{
    // The address still contains "{# ... #}" placeholders: the compile step
    // below is disabled, so the request is made with the literal text.
    var address = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1";

    var compiler = new UrlCompile();
    //address = compiler.Compile(address);

    var crawler = new RuiJiCrawler();
    var response = crawler.Request(new Request(address));

    var selectorExpression = @" reg /jsonp[\d]+?\((.*)\)/ 1 jpath $..url ";
    var block = RuiJiBlockParser.ParserBlock(selectorExpression);

    var extracted = RuiJiExtractor.Extract(response.Data.ToString(), block);

    Assert.IsTrue(extracted.Content.ToString().Length > 0);
}
// Downloads rule.Url and runs the rule's expression against the response.
//
// rule  - rule under test; supplies Url, Method, RunJS, RuiJiExpression and Feature.
// debug - when false, CrawlTaskFunc.ClearContent is applied to the result.
//
// Returns the extract result with the most metas (merged with its paged
// content when paging links and a "content" meta are present), null when the
// extractor produced no result, or an empty object when the download yielded
// no data.
public object TestRule([FromBody] RuleModel rule, bool debug = false)
{
    var request = new Request(rule.Url);
    request.Method = rule.Method;
    request.RunJS = (rule.RunJS == Status.ON);

    if (request.RunJS)
    {
        // NOTE(review): this assigns the property to itself and therefore has no
        // visible effect. Presumably the value was meant to come from the rule —
        // confirm the intended source before changing it.
        request.WaitDom = request.WaitDom;
    }

    var response = Crawler.Request(request);
    if (response == null || response.Data == null)
    {
        return new { };
    }

    var content = response.Data.ToString();
    var block = RuiJiBlockParser.ParserBlock(rule.RuiJiExpression);

    var extractRequest = new ExtractRequest();
    extractRequest.Content = content;
    extractRequest.Blocks = new List<ExtractFeatureBlock>
    {
        new ExtractFeatureBlock(block, rule.Feature)
    };

    var results = Extractor.Extract(extractRequest);

    // Metas is treated as nullable further down, so the ordering key must not
    // dereference it blindly; a result without metas ranks as zero.
    var result = results
        .OrderByDescending(m => m.Metas == null ? 0 : m.Metas.Count)
        .FirstOrDefault();

    if (result == null)
    {
        // Nothing was extracted: there is nothing to merge or clear.
        return null;
    }

    var hasPaging = result.Paging != null && result.Paging.Count > 0;
    if (hasPaging && result.Metas != null && result.Metas.ContainsKey("content"))
    {
        result = PagingExtractor.MergeContent(new Uri(rule.Url), result, block);
    }

    if (!debug)
    {
        CrawlTaskFunc.ClearContent(result);
    }

    return result;
}
// Smoke test: extracts tiles and paging links from a proxy list page and,
// when both are present, crawls the paged results, storing each page's tiles
// as JSON under "www/download". The final assertion is unconditional, so the
// test only fails if a step throws.
public void TestPaging()
{
    var crawler = new RuiJiCrawler();
    var request = new Request("https://www.kuaidaili.com/free/inha/10");
    var html = crawler.Request(request).Data.ToString();

    var expression = @" [tile] css table.table-bordered tr:gt(0):ohtml [meta] #ip css td[data-title='IP']:text #port css td[data-title='PORT']:text [paging] css #listnav a[href]";
    var block = RuiJiBlockParser.ParserBlock(expression);

    var firstPage = RuiJiExtractor.Extract(html, block);

    var hasPaging = firstPage.Paging != null && firstPage.Paging.Count > 0;
    if (hasPaging && firstPage.Tiles != null)
    {
        var downloadDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download");
        var storage = new FileStorage(downloadDir);

        // The callback receives each crawled page's address and extract result.
        PagingExtractor.CrawlPage(request.Uri, firstPage, block, (pageUri, pageResult) =>
        {
            var item = new DownloadContentModel
            {
                Url = pageUri.AbsolutePath.Trim(),
                IsRaw = false,
                Data = JsonConvert.SerializeObject(pageResult.Tiles)
            };

            storage.Insert(item);
        }, int.MaxValue);
    }

    Assert.True(true);
}
/// <summary>
/// constructor; stores the block and, when a feature text is given, parses it
/// into feature selectors (one selector expression per line).
/// </summary>
/// <param name="block">extract block</param>
/// <param name="feature">
/// extract feature: selector expressions separated by line breaks. When null
/// or empty no extract feature is created. Blank lines are ignored.
/// </param>
public ExtractFeatureBlock(ExtractBlock block, string feature = "")
{
    this.Block = block;

    if (string.IsNullOrEmpty(feature))
    {
        return;
    }

    var selectors = new List<ISelector>();
    var lines = feature.Replace("\r\n", "\n").Split('\n');

    foreach (var line in lines)
    {
        // A trailing line break or an empty line in the feature text is not a
        // selector expression; skip it instead of handing it to the parser.
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        selectors.Add(RuiJiBlockParser.ParserSelector(line));
    }

    ExtractFeature = new ExtractFeature();
    ExtractFeature.Features = selectors;
}
// Parses a block expression with unnamed [block]/[tile] sections, two metas
// and duplicated [paging] sections, converts the parsed block back to
// expression text, and checks that the block has at least one meta.
public void TestBlock()
{
    var source = @" [block] [blocks] @block1 @block2 [tile] css img [meta] #title css img:[title] #src css img:[src] [paging] css #listnav a:[href] [paging] css #listnav a:[href] [block] #block1 css .list1 [block] #block2 css .list2 ";

    var parsed = RuiJiBlockParser.ParserBlock(source);

    // Result is unused; the conversion just has to succeed.
    var regenerated = Converter.ToExpression(parsed);

    Assert.True(parsed.Metas.Count > 0);
}
// Extracts the distinct link addresses from a downloaded snapshot.
//
// The tile selector defaults to the href of every <a>; a non-empty
// snapshot.Expression replaces those default selectors. Fragments ("#...")
// are stripped and relative links are resolved against snapshot.RequestUrl.
//
// Returns the de-duplicated list of addresses (empty when no tiles were found).
protected List<string> ExtractFeedAddress(Snapshot snapshot)
{
    var block = new ExtractBlock();
    block.TileSelector.Selectors.Add(new CssSelector("a", "href"));

    if (!string.IsNullOrEmpty(snapshot.Expression))
    {
        block.TileSelector.Selectors.Clear();

        var selectors = RuiJiBlockParser.ParserBase(snapshot.Expression).Selectors;
        block.TileSelector.Selectors.AddRange(selectors);
    }

    var result = RuiJiExtractor.Extract(snapshot.Content, block);
    var results = new List<string>();

    if (result.Tiles != null)
    {
        // Built on first use only, so an unusable RequestUrl still cannot fail
        // a page that contains no relative links.
        Uri baseUri = null;

        foreach (var item in result.Tiles)
        {
            var href = item.Content.ToString();

            // Drop the fragment: it never changes which document is fetched.
            var fragmentStart = href.IndexOf('#');
            if (fragmentStart >= 0)
            {
                href = href.Substring(0, fragmentStart);
            }

            if (Uri.IsWellFormedUriString(href, UriKind.Relative))
            {
                if (baseUri == null)
                {
                    baseUri = new Uri(snapshot.RequestUrl);
                }

                href = new Uri(baseUri, href).AbsoluteUri;
            }

            results.Add(href);
        }
    }

    return results.Distinct().ToList();
}
// Executes a crawl task: loads the feed named by the task model, and for each
// address the feed's address template expands to, downloads the feed,
// extracts it with the feed expression, collects the linked addresses and
// fetches each of them through Cooperater. Progress messages are reported via
// task.Progress.
//
// t    - expected to be a CrawlTaskModel (read via an 'as' cast).
// task - supplies the progress reporter (read as IProgress<string>).
//
// Returns a List<object> holding each feed extract result followed by a
// ContentModel for every linked address that produced a result.
public object Run(object t, ParallelTask task)
{
    var model = t as CrawlTaskModel;
    var results = new List<object>();
    var progress = task.Progress as IProgress<string>;

    progress.Report("正在读取Feed记录");
    var feed = FeedLiteDb.GetFeed(model.FeedId);

    progress.Report("正在下载 Feed");
    var addresses = new UrlCompile().GetResult(feed.Address);

    foreach (var address in addresses)
    {
        feed.Address = address.ToString();

        var snapshot = new FeedJob().DoTask(feed, false);
        progress.Report("Feed 下载完成");

        var feedBlock = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);
        results.Add(RuiJiExtractor.Extract(snapshot.Content, feedBlock));

        progress.Report("正在提取Feed地址");
        var urls = new FeedExtractJob().ExtractAddress(snapshot);
        progress.Report("Feed地址提取完成");

        // Linked pages are only fetched when the snapshot carries an expression.
        if (!string.IsNullOrEmpty(snapshot.RuiJiExpression))
        {
            foreach (var url in urls)
            {
                progress.Report("正在提取地址 " + url);

                var extracted = Cooperater.GetResult(url);
                if (extracted == null)
                {
                    continue;
                }

                results.Add(new ContentModel
                {
                    Id = model.FeedId,
                    Url = url,
                    Metas = extracted.Metas,
                    CDate = DateTime.Now
                });
            }
        }

        progress.Report("计算完成");

        // Applied to everything collected so far, once per address iteration.
        if (!model.IncludeContent)
        {
            foreach (var entry in results)
            {
                ClearContent(entry);
            }
        }
    }

    return results;
}
// Parses a meta expression modelled on a search result page, combining css
// selectors with regS, regR, text, ex, exp, jpath, xpath and clear selectors,
// and checks that at least one meta was produced.
public void TestMeta()
{
    var expression = @" [block] css #content_left [tile] css .result [meta] #title css h3.c-title:text #src css h3.c-title a:[href] #media css .c-author:text regS /\s+/ 0 #date css .c-author:text regS /\s+/ 1 #summary css .c-summary css .c-info -r css .c-author:text -r #text css .c-summary:text text /bmw/ /bmw/ #regS css .c-summary regS /bmw/ 1 #regR css .c-summary regR /bmw/ aabbcc #ex css .c-summary ex /bmw/ -b #exp css .c-summary exp http://*.baidu.com/* /\s+/ #jpath css .c-summary jpath ..url #xpath css .c-summary xpath /aa/bb/c:[data] xpath /aa/bb/c:text xpath /aa/bb/c:xml xpath /aa/bb/c #clear css .c-summary clear span em ";

    var parsedMetas = RuiJiBlockParser.ParserMeta(expression);

    Assert.True(parsedMetas.Count > 0);
}