コード例 #1
0
        /// <summary>
        /// Derives <c>monitor.Status</c> from the current and historical counters, then saves the
        /// monitoring record for the site, creating one when no record with the same site name exists.
        /// </summary>
        /// <param name="monitor">
        /// Statistics of the crawl that just ran. Its <c>Status</c> is overwritten here:
        /// 0 = crawl failure, 1 = no updates yet, 2 = has updates.
        /// </param>
        public void AddOrUpdateArticleMontior(ArticleMonitor monitor)
        {
            // The three outcomes are mutually exclusive and checked in priority order.
            // (As separate "if" statements the last check used to overwrite both the
            // failure status and the "newer publish date" update status with 1.)
            if (monitor.CurrentCount == 0)
            {
                monitor.Status = 0; // crawl failure: nothing was fetched
            }
            else if (monitor.CurrentCount > monitor.HistoryCount || monitor.CurrentPublishDate > monitor.HistoryPublishDate)
            {
                monitor.Status = 2; // has updates
            }
            else
            {
                monitor.Status = 1; // no updates yet
            }

            ArticleMonitor history = this.context.ArticleMonitors.FirstOrDefault(f => f.SiteName == monitor.SiteName);

            if (history == null)
            {
                // First record for this site: the name and URL are only set on creation.
                history          = new ArticleMonitor();
                history.SiteName = monitor.SiteName;
                history.SiteUrl  = monitor.SiteUrl;
            }
            history.Status    = monitor.Status;
            history.StartTime = monitor.StartTime;
            history.EndTime   = DateTime.Now;

            this.context.ArticleMonitors.Attach(history);
            try
            {
                // An Id below 1 is treated as "not stored yet" and inserted; otherwise the row is updated.
                this.context.Entry(history).State = history.Id < 1 ? EntityState.Added : EntityState.Modified;
                this.SaveChanges();
            }
            finally
            {
                // Always stop tracking the record, even when the save failed, so a later
                // SaveChanges on the same context does not pick it up again.
                this.context.Entry(history).State = EntityState.Detached;
            }
        }
コード例 #2
0
        /// <summary>
        /// Fills in the count and publish-date statistics on <paramref name="monitor"/> and then
        /// saves the crawled articles, calling <c>SaveChanges</c> once per article.
        /// </summary>
        /// <remarks>
        /// Null articles and articles with blank content are not saved. When several articles
        /// share a URL only the first one is saved and a warning is logged. An article whose URL
        /// is already stored is saved as modified; any other article is added.
        /// </remarks>
        /// <param name="articles">Articles from the current crawl; may be null, in which case only the statistics are set.</param>
        /// <param name="monitor">Receives <c>HistoryCount</c>, <c>HistoryPublishDate</c>, <c>CurrentCount</c> and <c>CurrentPublishDate</c>.</param>
        public void AddOrUpdateArticles(IEnumerable <Article> articles, ArticleMonitor monitor)
        {
            // Statistics of what is already stored for this site (taken before anything is saved).
            monitor.HistoryCount       = this.context.Articles.Count(s => s.SiteName == monitor.SiteName);
            monitor.HistoryPublishDate = this.context.Articles.Where(a => a.SiteName == monitor.SiteName && a.PublishDate != null).Max(a => a.PublishDate);

            // Enumerate the caller's sequence exactly once; everything below works on this snapshot.
            var incoming = articles?.ToList();

            monitor.CurrentCount       = incoming?.Count ?? 0;
            monitor.CurrentPublishDate = incoming?.Where(a => a != null && a.PublishDate != null).Max(a => a.PublishDate);
            if (incoming == null)
            {
                return;
            }

            // Keep the first article per URL among those that have content, warning about duplicates.
            var unique = new List <Article>();

            foreach (var group in incoming.Where(a => !string.IsNullOrWhiteSpace(a?.Content)).GroupBy(a => a.Url))
            {
                var first = group.First();

                if (group.Skip(1).Any())
                {
                    Logging.WriteEntry(this, LogType.Warning, $"Article {first.Url} is duplicated.");
                }
                unique.Add(first);
            }

            if (unique.Count == 0)
            {
                // Nothing to save, so the lookup query below is unnecessary.
                return;
            }

            // A concrete array so the query provider receives a plain in-memory list of keys.
            var keys = unique.Select(a => a.Url).ToArray();

            // Hash set: the membership test in the loop below is then constant-time per article.
            var existKeys = new HashSet <string>(
                this.context
                .Articles
                .AsNoTracking()
                .Where(article => keys.Contains(article.Url))
                .Select(article => article.Url));

            foreach (var article in unique)
            {
                this.context.Articles.Attach(article);
                try
                {
                    this.context.Entry(article).State = existKeys.Contains(article.Url) ? EntityState.Modified : EntityState.Added;
                    this.SaveChanges();
                }
                finally
                {
                    // Always stop tracking the article, even when the save failed, so a later
                    // SaveChanges on the same context does not try to save it again.
                    this.context.Entry(article).State = EntityState.Detached;
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Runs one crawl pass: reads the article list from the page reader, resolves each
        /// article's details through the page parser, saves articles and their attachments via
        /// the data service, stores the monitor record, and logs the totals and elapsed time.
        /// </summary>
        /// <param name="siteParameter">
        /// Supplies the site name and the monitored URL: <c>StartUrl</c> when it is not blank,
        /// otherwise <c>UrlPattern</c> formatted with <c>StartNumber</c> and <c>PageStepNumber</c>.
        /// </param>
        public void Crawl(SiteParameter siteParameter)
        {
            Stopwatch timer = Stopwatch.StartNew();

            ArticleMonitor monitor = new ArticleMonitor();
            monitor.StartTime = DateTime.Now;
            monitor.SiteName  = siteParameter.SiteName;

            // An explicit start URL wins; without one the URL is built from the pattern.
            monitor.SiteUrl = string.IsNullOrWhiteSpace(siteParameter.StartUrl)
                              ? string.Format(siteParameter.UrlPattern, siteParameter.StartNumber, siteParameter.PageStepNumber)
                              : siteParameter.StartUrl;

            // The whole list is read first, and only then are the details resolved one by one.
            IEnumerable <Article> listed   = this.pageReader.GetArticals().ToArray();
            List <Article>        articles = new List <Article>();

            foreach (Article listedArticle in listed)
            {
                articles.Add(this.pageParser.GetArticleDetails(listedArticle));
            }

            this.dataService.AddOrUpdateArticles(articles, monitor);

            int attachmentCount = 0;

            foreach (Article article in articles)
            {
                var attachments = this.pageParser.GetAttachments(article);

                if (attachments != null)
                {
                    attachmentCount += attachments.Count();
                }

                // The attachments are handed over as returned, including a null result.
                this.dataService.AddOrUpdateArticleAttachments(attachments);
            }

            this.dataService.AddOrUpdateArticleMontior(monitor);

            Logging.WriteEntry(
                this,
                LogType.Information,
                string.Format("{0} articles crawled, {1} attachments crawled.", articles.Count, attachmentCount));

            Logging.WriteEntry(this, LogType.Information, $"{timer.Elapsed} elapsed.");
        }