示例#1
0
        /// <summary>
        /// Scheduler entry point: reads the job settings from the job data map, expands every
        /// feed address with <c>UrlCompile</c> and runs <c>DoTask</c> for each expanded address
        /// on a bounded worker pool, waiting until all queued items have finished.
        /// </summary>
        /// <param name="context">Execution context whose job data map supplies "baseUrl", "proxyUrl" and "node".</param>
        /// <returns>A task that completes when all queued feed work items have finished.</returns>
        public async Task Execute(IJobExecutionContext context)
        {
            // Skip this trigger when a previous execution has not finished yet.
            if (IsRunning)
            {
                return;
            }

            IsRunning = true;

            try
            {
                baseUrl   = context.JobDetail.JobDataMap.Get("baseUrl").ToString();
                proxyUrl  = context.JobDetail.JobDataMap.Get("proxyUrl").ToString();
                feedNode  = context.JobDetail.JobDataMap.Get("node") as FeedNode;

                Logger.GetLogger(baseUrl).Info("start feed job execute");

                await Task.Factory.StartNew(() =>
                {
                    var feeds   = GetFeedJobs(baseUrl, proxyUrl, feedNode);
                    var compile = new UrlCompile();

                    var stpStartInfo = new STPStartInfo
                    {
                        IdleTimeout      = 3000,
                        MaxWorkerThreads = 8,
                        MinWorkerThreads = 0
                    };

                    var pool = new SmartThreadPool(stpStartInfo);

                    try
                    {
                        var waits = new List <IWorkItemResult>();

                        foreach (var feed in feeds)
                        {
                            var addrs = compile.GetResult(feed.Address);

                            Logger.GetLogger(baseUrl).Info("compile address " + feed.Address + " result " + string.Join(",", addrs));

                            foreach (var addr in addrs)
                            {
                                // NOTE(review): the same feed instance is queued for every address while
                                // its Address is overwritten here, so work items that start late may
                                // observe a later address. Queue a per-address copy once a safe way to
                                // copy the feed model is available.
                                feed.Address = addr.ToString();

                                var item = pool.QueueWorkItem((u) =>
                                {
                                    DoTask(u, true);
                                }, feed);

                                waits.Add(item);
                            }
                        }

                        SmartThreadPool.WaitAll(waits.ToArray());

                        pool.Shutdown(true, 1000);
                    }
                    finally
                    {
                        // Release the pool even when queuing or waiting throws.
                        pool.Dispose();
                    }
                });
            }
            finally
            {
                // Always clear the flag; otherwise a single failure would block every later run.
                IsRunning = false;
            }

            Logger.GetLogger(baseUrl).Info("end feed job execute");
        }
示例#2
0
        /// <summary>
        /// Runs a feed once for every address produced by <c>UrlCompile</c>, extracts the feed
        /// expression from each snapshot and optionally downloads every absolute URL found in
        /// the extracted content into the "www/download" folder under the application base directory.
        /// </summary>
        /// <param name="feed">Feed definition; its <c>Address</c> is overwritten with each expanded address.</param>
        /// <param name="down">When true, each well-formed absolute URL line in the extracted content is requested and stored.</param>
        /// <param name="debug">When false, <c>CrawlTaskFunc.ClearContent</c> is applied to each extraction result.</param>
        /// <returns>The list of extraction results, or the caught exception object when anything fails.</returns>
        public object TestFeed(FeedModel feed, [FromUri] bool down, [FromUri] bool debug = false)
        {
            try
            {
                var compile = new UrlCompile();
                var addrs   = compile.GetResult(feed.Address);
                var results = new List <ExtractResult>();

                foreach (var addr in addrs)
                {
                    feed.Address = addr.ToString();
                    var job  = new FeedJob();
                    var snap = job.DoTask(feed, false);

                    // Without an expression there is nothing to extract; keep a placeholder
                    // so the result list still has one entry per address.
                    if (string.IsNullOrEmpty(feed.RuiJiExpression))
                    {
                        results.Add(new ExtractResult());
                        continue;
                    }

                    var block = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);

                    var result = RuiJiExtractor.Extract(snap.Content, block);

                    // NOTE(review): ClearContent runs before the download step reads
                    // result.Content - confirm it leaves Content usable when down is true.
                    if (!debug)
                    {
                        CrawlTaskFunc.ClearContent(result);
                    }

                    if (down)
                    {
                        var s = new FileStorage(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "www", "download"));

                        var files = result.Content.ToString().Replace("\r\n", "\n").Split('\n');
                        foreach (var file in files)
                        {
                            // Trim first so the same text is validated, requested and stored;
                            // surrounding whitespace would otherwise fail the URI check.
                            var fileUrl = file.Trim();

                            if (!string.IsNullOrEmpty(fileUrl) && Uri.IsWellFormedUriString(fileUrl, UriKind.Absolute))
                            {
                                var res = Crawler.Request(fileUrl);
                                var c   = new DownloadContentModel();
                                c.Url   = fileUrl;
                                c.IsRaw = res.IsRaw;
                                c.Data  = res.Data;

                                s.Insert(c);
                            }
                        }
                    }

                    results.Add(result);
                }

                return(results);
            }
            catch (Exception ex)
            {
                // The exception object itself is handed back to the caller as the response.
                return(ex);
            }
        }
示例#3
0
        /// <summary>
        /// Compiles a URL template containing ticks/now/page/limit placeholders and checks
        /// that <c>UrlCompile.GetResult</c> yields at least one address.
        /// </summary>
        public void TestMethod1()
        {
            var url = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page={# page(1,10) #}&&start={# limit(1,5,2) #}";

            var f    = new UrlCompile();
            var urls = f.GetResult(url);

            Assert.IsNotNull(urls);

            // Count by enumeration so the check does not depend on the concrete collection type.
            var count = 0;
            foreach (var compiled in urls)
            {
                count++;
            }

            Assert.IsTrue(count > 0);
        }
示例#4
0
        /// <summary>
        /// Requests a JSONP endpoint, strips the callback wrapper with a regex rule, selects
        /// every "url" value with a JSONPath rule and asserts that the extracted content is not empty.
        /// </summary>
        public void TestJsonPExtract()
        {
            var url = "http://app.cannews.com.cn/roll.php?do=query&callback=jsonp1475197217819&_={# ticks() #}&date={# now(\"yyyy-MM-dd\") #}&size=20&page=1";

            // The compiler is instantiated but never applied: the Compile call that followed it
            // is disabled, so the address is passed to the request exactly as written above.
            var compiler = new UrlCompile();

            var crawler  = new RuiJiCrawler();
            var request  = new Request(url);
            var response = crawler.Request(request);

            var expression = @"
reg /jsonp[\d]+?\((.*)\)/ 1
jpath $..url
";
            var block     = RuiJiBlockParser.ParserBlock(expression);
            var body      = response.Data.ToString();
            var extracted = RuiJiExtractor.Extract(body, block);

            var content = extracted.Content.ToString();

            Assert.IsTrue(content.Length > 0);
        }
示例#5
0
        /// <summary>
        /// Builds the feed requests for every ".feed" file found in <c>jobPath</c>. Each file is
        /// parsed, its request URI is expanded with <c>UrlCompile</c>, and one
        /// <c>FeedRequest</c> is produced per expanded address.
        /// </summary>
        /// <returns>
        /// The collected feed requests; an empty list when any step throws (the error message is logged).
        /// </returns>
        protected override List <FeedRequest> GetRequests()
        {
            Logger.GetLogger("").Info("start get feed");

            try
            {
                var requests = new List <FeedRequest>();
                var compile  = new UrlCompile();
                var files    = Directory.GetFiles(jobPath);

                foreach (var file in files)
                {
                    // Ordinal, case-insensitive match avoids culture-dependent lowercasing.
                    var extension = Path.GetExtension(file);
                    if (!string.Equals(extension, ".feed", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    var parser = new RuiJiParser();
                    var result = parser.ParseFile(file);

                    if (!result)
                    {
                        continue;
                    }

                    var request = parser.GetResult <Request>().Result;
                    var setting = parser.GetResult <FeedSetting>().Result;

                    if (request == null || setting == null)
                    {
                        continue;
                    }

                    var addrs = compile.GetResult(request.Uri.ToString());

                    // Snapshot the parsed setting once. Every request gets its own copy so the
                    // index suffix is applied to the original id instead of piling up
                    // ("id_0_1_2") on one instance shared by all requests.
                    // Assumes FeedSetting survives a JSON round trip, as it is already
                    // serialized into Request.Tag - TODO confirm no ignored members are needed.
                    var baseId      = setting.Id;
                    var settingJson = JsonConvert.SerializeObject(setting);

                    for (int i = 0; i < addrs.Length; i++)
                    {
                        var addr = addrs[i].ToString();

                        var itemSetting = JsonConvert.DeserializeObject <FeedSetting>(settingJson);
                        itemSetting.Id  = baseId + "_" + i;

                        var r = request.Clone() as Request;
                        r.Uri = new Uri(addr);
                        r.Tag = JsonConvert.SerializeObject(itemSetting);

                        var fr = new FeedRequest();
                        fr.Request    = r;
                        fr.Setting    = itemSetting;
                        fr.Expression = parser.GetResult <ExtractBlock>().Expression;

                        requests.Add(fr);
                    }
                }

                return(requests);
            }
            catch (Exception ex)
            {
                Logger.GetLogger("").Info("get feed error " + ex.Message);

                return(new List <FeedRequest>());
            }
        }
示例#6
0
        /// <summary>
        /// Executes a crawl task: loads the feed named by the task model, runs it for every
        /// address produced by <c>UrlCompile</c>, extracts the feed result and the addresses it
        /// links to, and collects a <c>ContentModel</c> for each address that yields a result.
        /// </summary>
        /// <param name="t">Task argument; expected to be a <c>CrawlTaskModel</c>.</param>
        /// <param name="task">Running task whose <c>Progress</c> receives status messages when it is an <c>IProgress&lt;string&gt;</c>.</param>
        /// <returns>
        /// A list holding each feed extraction result followed by the content models gathered for it.
        /// </returns>
        public object Run(object t, ParallelTask task)
        {
            var model = t as CrawlTaskModel;

            var results  = new List <object>();
            var reporter = task.Progress as IProgress <string>;

            // Progress reporting is best-effort: skip it when no compatible reporter is attached
            // instead of failing the whole task on the first status message.
            Action <string> report = (message) =>
            {
                if (reporter != null)
                {
                    reporter.Report(message);
                }
            };

            report("正在读取Feed记录");
            var feed = FeedLiteDb.GetFeed(model.FeedId);

            report("正在下载 Feed");

            var compile = new UrlCompile();
            var addrs   = compile.GetResult(feed.Address);

            foreach (var addr in addrs)
            {
                feed.Address = addr.ToString();

                var job  = new FeedJob();
                var snap = job.DoTask(feed, false);
                report("Feed 下载完成");

                var block = RuiJiBlockParser.ParserBlock(feed.RuiJiExpression);

                var feedResult = RuiJiExtractor.Extract(snap.Content, block);
                results.Add(feedResult);

                report("正在提取Feed地址");
                var j    = new FeedExtractJob();
                var urls = j.ExtractAddress(snap);
                report("Feed地址提取完成");

                if (!string.IsNullOrEmpty(snap.RuiJiExpression))
                {
                    foreach (var url in urls)
                    {
                        report("正在提取地址 " + url);
                        var result = Cooperater.GetResult(url);

                        if (result != null)
                        {
                            var cm = new ContentModel();
                            cm.Id    = model.FeedId;
                            cm.Url   = url;
                            cm.Metas = result.Metas;
                            cm.CDate = DateTime.Now;

                            results.Add(cm);
                        }
                    }
                }

                report("计算完成");
            }

            // Clear content once, after everything is collected, rather than re-walking the
            // whole accumulated list on every address iteration.
            // Assumes ClearContent is idempotent - TODO confirm.
            if (!model.IncludeContent)
            {
                results.ForEach((m) =>
                {
                    ClearContent(m);
                });
            }

            return(results);
        }