/// <summary>
/// Derives the status of a crawl run from the counters on <paramref name="monitor"/>
/// and inserts or updates the stored monitor row for the same site name.
/// </summary>
/// <remarks>
/// Status codes written to <c>monitor.Status</c> and to the stored row:
/// 0 = crawl failure (no articles crawled), 1 = no update, 2 = has update.
/// The branches are mutually exclusive so that a later check can no longer
/// overwrite the result of an earlier one.
/// </remarks>
/// <param name="monitor">
/// Monitor describing the current run. Its <c>Status</c> is overwritten; its
/// <c>SiteName</c>, <c>SiteUrl</c> and <c>StartTime</c> are copied to the stored row.
/// </param>
public void AddOrUpdateArticleMontior(ArticleMonitor monitor)
{
    if (monitor.CurrentCount == 0)
    {
        // Crawl failure: nothing was fetched in this run.
        monitor.Status = 0;
    }
    else if (monitor.CurrentCount > monitor.HistoryCount
        || monitor.CurrentPublishDate > monitor.HistoryPublishDate)
    {
        // Has update: more articles than before, or a newer publish date.
        monitor.Status = 2;
    }
    else
    {
        // No update.
        monitor.Status = 1;
    }

    ArticleMonitor history = this.context.ArticleMonitors
        .FirstOrDefault(f => f.SiteName == monitor.SiteName);

    if (history == null)
    {
        // First run for this site: create the row. The site URL is only recorded on creation.
        history = new ArticleMonitor();
        history.SiteName = monitor.SiteName;
        history.SiteUrl = monitor.SiteUrl;
    }

    history.Status = monitor.Status;
    history.StartTime = monitor.StartTime;
    history.EndTime = DateTime.Now;

    this.context.ArticleMonitors.Attach(history);

    // A row whose Id is below 1 is treated as not yet persisted.
    this.context.Entry(history).State = history.Id < 1 ? EntityState.Added : EntityState.Modified;
    this.SaveChanges();

    // Detach after saving so the context stops tracking this entity.
    this.context.Entry(history).State = EntityState.Detached;
}
/// <summary>
/// Records history/current statistics on <paramref name="monitor"/> and then inserts or
/// updates the given articles one at a time.
/// </summary>
/// <remarks>
/// Articles that are null or have blank <c>Content</c> are skipped. When several articles
/// share the same <c>Url</c>, a warning is logged and only the first one is saved.
/// The input sequence and the grouping are materialized once, so the source is not
/// re-enumerated and the grouping is not recomputed for each later step.
/// </remarks>
/// <param name="articles">
/// Crawled articles; may be null, in which case only the monitor statistics are filled in.
/// </param>
/// <param name="monitor">
/// Monitor whose <c>HistoryCount</c>, <c>HistoryPublishDate</c>, <c>CurrentCount</c> and
/// <c>CurrentPublishDate</c> are set. Its <c>SiteName</c> selects the stored articles.
/// </param>
public void AddOrUpdateArticles(IEnumerable<Article> articles, ArticleMonitor monitor)
{
    // Where(...) never returns null, so no null-conditional is needed on the query results.
    monitor.HistoryCount = this.context.Articles
        .Where(s => s.SiteName == monitor.SiteName)
        .Count();
    monitor.HistoryPublishDate = this.context.Articles
        .Where(a => a.SiteName == monitor.SiteName && a.PublishDate != null)
        .Max(a => a.PublishDate);

    // Materialize the input once; it is read several times below.
    List<Article> crawled = articles?.ToList();

    monitor.CurrentCount = crawled?.Count ?? 0;
    monitor.CurrentPublishDate = crawled?
        .Where(a => a != null && a.PublishDate != null)
        .Max(a => a.PublishDate);

    if (crawled == null)
    {
        return;
    }

    // Group the usable articles by URL a single time.
    var groups = crawled
        .Where(article => !string.IsNullOrWhiteSpace(article?.Content))
        .GroupBy(article => article.Url)
        .ToList();

    foreach (var group in groups.Where(g => g.Count() > 1))
    {
        // The group key is the URL shared by every article in the group.
        Logging.WriteEntry(this, LogType.Warning, $"Article {group.Key} is duplicated.");
    }

    // Keep the first article of every URL group.
    List<Article> unique = groups
        .Select(group => group.First())
        .ToList();

    if (unique.Count == 0)
    {
        // Nothing to save, so the lookup of existing URLs is skipped as well.
        return;
    }

    var keys = unique
        .Select(article => article.Url)
        .ToArray();

    // Hash set gives constant-time membership checks inside the save loop.
    var existKeys = new HashSet<string>(
        this.context
            .Articles
            .AsNoTracking()
            .Where(article => keys.Contains(article.Url))
            .Select(article => article.Url));

    foreach (var article in unique)
    {
        this.context.Articles.Attach(article);
        this.context.Entry(article).State = existKeys.Contains(article.Url)
            ? EntityState.Modified
            : EntityState.Added;

        // Saved one article at a time, then detached so the context does not keep tracking it.
        this.SaveChanges();
        this.context.Entry(article).State = EntityState.Detached;
    }
}
/// <summary>
/// Runs one crawl for the given site: reads the articles, loads their details, stores
/// articles and attachments through the data service, records the monitor entry and
/// logs the totals together with the elapsed time.
/// </summary>
/// <param name="siteParameter">
/// Site settings. <c>StartUrl</c> is used as the monitor's site URL when it is not blank;
/// otherwise the URL is built from <c>UrlPattern</c>, <c>StartNumber</c> and
/// <c>PageStepNumber</c>.
/// </param>
public void Crawl(SiteParameter siteParameter)
{
    Stopwatch timer = Stopwatch.StartNew();

    ArticleMonitor monitor = new ArticleMonitor()
    {
        StartTime = DateTime.Now,
        SiteName = siteParameter.SiteName
    };

    monitor.SiteUrl = string.IsNullOrWhiteSpace(siteParameter.StartUrl)
        ? string.Format(siteParameter.UrlPattern, siteParameter.StartNumber, siteParameter.PageStepNumber)
        : siteParameter.StartUrl;

    // The article list is fully read before any detail page is parsed.
    IEnumerable<Article> articles = this.pageReader.GetArticals().ToArray();
    articles = articles
        .Select(item => this.pageParser.GetArticleDetails(item))
        .ToArray();

    this.dataService.AddOrUpdateArticles(articles, monitor);

    int attachmentTotal = 0;
    foreach (var current in articles)
    {
        var found = this.pageParser.GetAttachments(current);
        if (found != null)
        {
            attachmentTotal += found.Count();
        }

        this.dataService.AddOrUpdateArticleAttachments(found);
    }

    this.dataService.AddOrUpdateArticleMontior(monitor);

    string summary = string.Format("{0} articles crawled, {1} attachments crawled.", articles.Count(), attachmentTotal);
    Logging.WriteEntry(this, LogType.Information, summary);
    Logging.WriteEntry(this, LogType.Information, $"{timer.Elapsed} elapsed.");
}