/// <summary>
/// Quartz job entry point: reads job parameters from the JobDataMap, expands each
/// feed address via <see cref="UrlCompile"/>, and crawls every expanded address on a
/// SmartThreadPool, waiting for all work items before finishing.
/// Re-entrancy is guarded by <c>IsRunning</c>; overlapping triggers are skipped.
/// </summary>
/// <param name="context">Quartz execution context carrying "baseUrl", "proxyUrl" and "node".</param>
public async Task Execute(IJobExecutionContext context)
{
    // Skip this trigger if the previous run is still in flight.
    if (IsRunning)
        return;

    IsRunning = true;

    try
    {
        baseUrl = context.JobDetail.JobDataMap.Get("baseUrl").ToString();
        proxyUrl = context.JobDetail.JobDataMap.Get("proxyUrl").ToString();
        feedNode = context.JobDetail.JobDataMap.Get("node") as FeedNode;

        Logger.GetLogger(baseUrl).Info("start feed job execute");

        // Task.Run is the idiomatic way to offload this CPU/IO-mixed batch;
        // Task.Factory.StartNew offers no benefit here.
        await Task.Run(() =>
        {
            var feeds = GetFeedJobs(baseUrl, proxyUrl, feedNode);
            var compile = new UrlCompile();

            var stpStartInfo = new STPStartInfo
            {
                IdleTimeout = 3000,
                MaxWorkerThreads = 8,
                MinWorkerThreads = 0
            };

            var pool = new SmartThreadPool(stpStartInfo);
            var waits = new List<IWorkItemResult>();

            foreach (var feed in feeds)
            {
                var addrs = compile.GetResult(feed.Address);
                Logger.GetLogger(baseUrl).Info("compile address " + feed.Address + " result " + string.Join(",", addrs));

                foreach (var addr in addrs)
                {
                    // NOTE(review): the same feed instance is mutated here and captured
                    // by the queued work item; if DoTask reads feed.Address after later
                    // iterations run, work items may all see the last address. Verify
                    // whether DoTask copies the feed before the pool executes it.
                    feed.Address = addr.ToString();
                    var item = pool.QueueWorkItem((u) => { DoTask(u, true); }, feed);
                    waits.Add(item);
                }
            }

            SmartThreadPool.WaitAll(waits.ToArray());
            pool.Shutdown(true, 1000);
            pool.Dispose();
        });

        Logger.GetLogger(baseUrl).Info("end feed job execute");
    }
    finally
    {
        // Always release the re-entrancy guard — without this, a single failed
        // run would leave IsRunning stuck at true and disable the job forever.
        IsRunning = false;
    }
}
/// <summary>
/// Web API action: compiles the feed address into one or more concrete URLs, crawls each,
/// extracts content with the feed's RuiJi expression, and optionally downloads the
/// extracted file URLs to www/download. Returns the list of extraction results, or the
/// caught exception object on failure.
/// </summary>
/// <param name="feed">Feed definition (address template + extraction expression).</param>
/// <param name="down">When true, download every extracted file URL to local storage.</param>
/// <param name="debug">When true, skip ClearContent so raw content stays in the result.</param>
public object TestFeed(FeedModel feed, [FromUri] bool down, [FromUri] bool debug = false)
{
    try
    {
        var compile = new UrlCompile();
        var addrs = compile.GetResult(feed.Address);
        var results = new List<ExtractResult>();

        foreach (var addr in addrs)
        {
            feed.Address = addr.ToString();

            var job = new FeedJob();
            var snap = job.DoTask(feed, false);

            // Without an expression there is nothing to extract — emit a placeholder result.
            if (string.IsNullOrEmpty(feed.RuiJiExpression))
            {
                results.Add(new ExtractResult());
                continue;
            }

            var block = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);
            var result = RuiJiExtractor.Extract(snap.Content, block);

            if (!debug)
            {
                CrawlTaskFunc.ClearContent(result);
            }

            if (down)
            {
                var s = new FileStorage(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download"));
                var files = result.Content.ToString().Replace("\r\n", "\n").Split('\n');

                foreach (var file in files)
                {
                    // Trim once and use the trimmed URL consistently — the original
                    // requested the raw line but stored the trimmed one, so a line with
                    // stray whitespace could be fetched under a different key than stored.
                    var url = file?.Trim();

                    if (!string.IsNullOrEmpty(url) && Uri.IsWellFormedUriString(url, UriKind.Absolute))
                    {
                        var res = Crawler.Request(url);

                        var c = new DownloadContentModel();
                        c.Url = url;
                        c.IsRaw = res.IsRaw;
                        c.Data = res.Data;

                        s.Insert(c);
                    }
                }
            }

            results.Add(result);
        }

        return (results);
    }
    catch (Exception ex)
    {
        // NOTE(review): returning the exception object serializes it to the client —
        // presumably intentional for this test endpoint, but it leaks stack details;
        // confirm this is acceptable for the deployment environment.
        return (ex);
    }
}
/// <summary>
/// Verifies that UrlCompile expands a templated URL ({# ticks() #}, {# now(...) #},
/// {# page(...) #}, {# limit(...) #}) into concrete URLs.
/// The original test asserted Assert.IsTrue(true), which can never fail; assert on
/// the actual compile output instead.
/// </summary>
public void TestMethod1()
{
    var url = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page={# page(1,10) #}&&start={# limit(1,5,2) #}";
    var f = new UrlCompile();

    var urls = f.GetResult(url);

    // The template contains page(1,10), so compilation must yield at least one URL.
    Assert.IsNotNull(urls);
    Assert.IsTrue(urls.Length > 0);
}
/// <summary>
/// End-to-end check of JSONP extraction: fetches a JSONP endpoint, strips the
/// jsonpNNN(...) wrapper with a regex block, then selects all "url" values via JPath,
/// asserting that at least some content was extracted.
/// (Removed an unused UrlCompile local and the commented-out call that used it.)
/// </summary>
public void TestJsonPExtract()
{
    var url = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1";

    var c = new RuiJiCrawler();
    var response = c.Request(new Request(url));

    // First capture-group of the regex unwraps the JSONP payload, then JPath
    // collects every "url" property from the resulting JSON.
    var expression = @"
reg /jsonp[\d]+?\((.*)\)/ 1
jpath $..url
";

    var b = RuiJiBlockParser.ParserBlock(expression);
    var result = RuiJiExtractor.Extract(response.Data.ToString(), b);

    Assert.IsTrue(result.Content.ToString().Length > 0);
}
/// <summary>
/// Scans jobPath for *.feed files, parses each into a Request + FeedSetting pair,
/// expands the request URI via UrlCompile, and builds one FeedRequest per expanded
/// address. Returns an empty list (after logging) on any error.
/// </summary>
protected override List<FeedRequest> GetRequests()
{
    Logger.GetLogger("").Info("start get feed");

    try
    {
        var requests = new List<FeedRequest>();
        var compile = new UrlCompile();

        var files = Directory.GetFiles(jobPath);

        foreach (var file in files)
        {
            var extension = Path.GetExtension(file).ToLower();
            if (extension != ".feed")
            {
                continue;
            }

            var parser = new RuiJiParser();
            var result = parser.ParseFile(file);

            if (result)
            {
                var request = parser.GetResult<Request>().Result;
                var setting = parser.GetResult<FeedSetting>().Result;

                if (request == null || setting == null)
                {
                    continue;
                }

                var addrs = compile.GetResult(request.Uri.ToString());

                // Capture the base id once: the original used `setting.Id += "_" + i`,
                // which accumulated suffixes across iterations (x_0, x_0_1, x_0_1_2 ...)
                // instead of producing x_0, x_1, x_2.
                var baseId = setting.Id;

                for (int i = 0; i < addrs.Length; i++)
                {
                    var addr = addrs[i].ToString();

                    var r = request.Clone() as Request;
                    r.Uri = new Uri(addr);

                    // NOTE(review): every FeedRequest shares this single setting
                    // instance, so after the loop all of them see the last id; confirm
                    // whether FeedSetting should be cloned per address as Request is.
                    setting.Id = baseId + "_" + i;
                    r.Tag = JsonConvert.SerializeObject(setting);

                    var fr = new FeedRequest();
                    fr.Request = r;
                    fr.Setting = setting;
                    fr.Expression = parser.GetResult<ExtractBlock>().Expression;

                    requests.Add(fr);
                }
            }
        }

        return (requests);
    }
    catch (Exception ex)
    {
        Logger.GetLogger("").Info("get feed error " + ex.Message);
        return (new List<FeedRequest>());
    }
}
/// <summary>
/// Parallel-task body: loads the feed identified by the CrawlTaskModel, downloads and
/// extracts it for every compiled address, then extracts and crawls the addresses the
/// feed yields, reporting progress through the task's IProgress&lt;string&gt;.
/// Returns the accumulated list of extraction results / content models.
/// </summary>
/// <param name="t">Expected to be a CrawlTaskModel (FeedId + IncludeContent flag).</param>
/// <param name="task">Carries the progress reporter in task.Progress.</param>
public object Run(object t, ParallelTask task)
{
    // NOTE(review): both casts are unchecked `as` — a wrong argument type produces a
    // NullReferenceException below rather than a clear error; verify callers always
    // pass a CrawlTaskModel and an IProgress<string>.
    var model = t as CrawlTaskModel;
    var results = new List<object>();
    var reporter = task.Progress as IProgress<string>;

    reporter.Report("正在读取Feed记录");
    var feed = FeedLiteDb.GetFeed(model.FeedId);

    reporter.Report("正在下载 Feed");

    var compile = new UrlCompile();
    var addrs = compile.GetResult(feed.Address);

    foreach (var addr in addrs)
    {
        feed.Address = addr.ToString();

        var job = new FeedJob();
        var snap = job.DoTask(feed, false);
        reporter.Report("Feed 下载完成");

        var block = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);
        var feedResult = RuiJiExtractor.Extract(snap.Content, block);
        results.Add(feedResult);

        reporter.Report("正在提取Feed地址");
        var j = new FeedExtractJob();
        var urls = j.ExtractAddress(snap);
        reporter.Report("Feed地址提取完成");

        if (!string.IsNullOrEmpty(snap.RuiJiExpression))
        {
            foreach (var url in urls)
            {
                reporter.Report("正在提取地址 " + url);

                var result = Cooperater.GetResult(url);

                if (result != null)
                {
                    var cm = new ContentModel();
                    cm.Id = model.FeedId;
                    cm.Url = url;
                    cm.Metas = result.Metas;
                    cm.CDate = DateTime.Now;

                    results.Add(cm);
                }
            }
        }

        reporter.Report("计算完成");
    }

    // Hoisted out of the per-address loop: the original re-cleared the whole
    // accumulated results list on every iteration (redundant quadratic work).
    // Each result is still cleared exactly as before the method returns.
    if (!model.IncludeContent)
    {
        results.ForEach((m) =>
        {
            ClearContent(m);
        });
    }

    return (results);
}