/// <summary>
/// Stores a single feed item in the "feedItems" collection of the LiteDB
/// database opened from <c>GetDbPath()</c>.
/// </summary>
/// <param name="item">
/// The row to insert. When its <c>PublishingDate</c> is null or equal to
/// <see cref="DateTime.MinValue"/>, it is replaced with the current local
/// time before the row is written.
/// </param>
public static void InsertFeedItem(RssFeedItemRow item)
{
    using (var database = new LiteDatabase(GetDbPath()))
    {
        var feedItems = database.GetCollection<RssFeedItemRow>("feedItems");

        // An item without a usable publishing date is stamped with "now".
        var lacksPublishingDate = item.PublishingDate == null
            || item.PublishingDate == DateTime.MinValue;
        if (lacksPublishingDate)
        {
            item.PublishingDate = DateTime.Now;
        }

        feedItems.Insert(item);
    }
}
/// <summary>
/// Crawls every active RSS channel and refreshes its stored feed items.
/// </summary>
/// <remarks>
/// Steps performed, in order:
/// <list type="number">
/// <item>Ensures the <c>../Logs/</c> folder (relative to the application root) exists and
/// removes crawler log files older than the retention window.</item>
/// <item>Creates a new log file named <c>{timestamp}-crawler.txt</c>.</item>
/// <item>Loads the active channels ordered by <c>Id</c> and, for each channel with a non-blank
/// link, fetches the feed, records the engine to use next time, replaces the channel's stored
/// items with the fetched ones (skipping invalid, black-listed or already existing items) and
/// updates the channel's error status.</item>
/// </list>
/// A failure while processing one channel is logged, stored on the channel and passed to
/// <c>ErrorHandle</c>; the loop then continues with the next channel. A failure outside the
/// per-channel handling is logged and passed to <c>ErrorHandle</c>.
/// </remarks>
public static void CrawlRss()
{
    var logFolder = Path.Combine(EnvironmentHelper.GetApplicationRoot(), "../Logs/");
    if (!Directory.Exists(logFolder))
    {
        Directory.CreateDirectory(logFolder);
    }

    DeleteExpiredCrawlerLogs(logFolder);

    // Format with the invariant culture: the cleanup above parses the file name with the
    // invariant culture, so both directions must agree regardless of the machine's locale.
    var logFileName = $"{DateTime.Now.ToString(LogDateTimeFormat, CultureInfo.InvariantCulture)}-crawler.txt";
    var logger = new MyLogger(Path.Combine(logFolder, logFileName));

    // Declared outside the try blocks so the outer catch can report the last URL touched.
    var feedUrl = string.Empty;
    try
    {
        List<RssChannelRow> channels = SimpleFeedlyDatabaseAccess.GetActiveChannels().OrderBy(x => x.Id).ToList();
        logger.Info($"There are {channels.Count} active channels");

        var progressCounter = 0;
        foreach (var channel in channels)
        {
            progressCounter++;
            feedUrl = channel.Link;
            if (string.IsNullOrWhiteSpace(feedUrl))
            {
                continue;
            }

            try
            {
                logger.Info($"- [{progressCounter}/{channels.Count}] Fetching url {feedUrl}");
                var feed = GetFeedsFromChannel(feedUrl, channel.RssCrawlerEngine, out RssCrawlerEngine usedEngine, out Exception fetchFeedError);
                logger.Info($" - Nbr of feed items {feed?.Items?.Count ?? 0}");

                // Update the default engine for the channel: fall back to CodeHollowFeedReader
                // when the fetch reported an error, otherwise keep the engine that was used.
                SimpleFeedlyDatabaseAccess.UpdateChannelDefaultEngine(
                    channel.Id,
                    fetchFeedError != null ? RssCrawlerEngine.CodeHollowFeedReader : usedEngine);

                if (feed?.Items == null)
                {
                    logger.Info(" - [NO ITEMS]");
                    SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(
                        channel.Id,
                        true,
                        fetchFeedError == null ? null : SerializeCrawlError(fetchFeedError));
                    if (fetchFeedError != null)
                    {
                        ErrorHandle(fetchFeedError, feedUrl);
                    }

                    continue;
                }

                var fetchedItems = feed.Items;
                if (fetchedItems.Count == 0)
                {
                    // An empty feed leaves the stored items and the channel status untouched.
                    continue;
                }

                SimpleFeedlyDatabaseAccess.DeleteAllFeedItemByChannelId(channel.Id);
                logger.Info(" - Deleted old items");

                // Loop-invariant: the group used for the existence check depends only on the channel.
                var channelDomainGroup = string.IsNullOrEmpty(channel.DomainGroup) ? channel.Link : channel.DomainGroup;

                var insertItems = new List<RssFeedItemRow>();
                foreach (var fItem in fetchedItems)
                {
                    if (!StringUtils.IsUrl(fItem.Link))
                    {
                        continue;
                    }

                    var feedItemKey = GenerateFeedItemKey(fItem);
                    if (string.IsNullOrWhiteSpace(feedItemKey) || string.IsNullOrWhiteSpace(fItem.Link))
                    {
                        continue;
                    }

                    var feedItem = new RssFeedItemRow
                    {
                        Channel = channel,
                        FeedItemKey = feedItemKey,
                        // Items without a title are shown by their link.
                        Title = string.IsNullOrWhiteSpace(fItem.Title) ? fItem.Link : fItem.Title,
                        Link = fItem.Link,
                        Description = fItem.Description,
                        PublishingDate = fItem.PublishingDate,
                        Author = fItem.Author
                    };

                    // The black list is looked up by the MD5 hash of the normalised, lower-cased title.
                    var shrinkedTitle = StringUtils.UnsignString(StringUtils.RemoveNonAlphaCharactersAndDigit(feedItem.Title)).ToLower();
                    var shrinkedTitleHash = StringUtils.MD5Hash(shrinkedTitle);
                    if (SimpleFeedlyDatabaseAccess.IsBlackListWord(shrinkedTitleHash))
                    {
                        continue;
                    }

                    if (SimpleFeedlyDatabaseAccess.IsExistedFeedItem(channel.Id, channelDomainGroup, feedItem.FeedItemKey))
                    {
                        continue;
                    }

                    insertItems.Add(feedItem);
                }

                SimpleFeedlyDatabaseAccess.InsertFeedItems(insertItems);
                logger.Info($" - Inserted {insertItems.Count} items");

                SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(channel.Id, false, null);
                logger.Info(" - Updated status");
            }
            catch (Exception err)
            {
                // Serialize once with loop-tolerant settings; a serialization failure here would
                // otherwise escape to the outer catch and stop the crawl for all remaining channels.
                var serializedError = SerializeCrawlError(err);
                logger.Error($" - Got Error: {serializedError}");
                SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(channel.Id, true, serializedError);
                ErrorHandle(err, feedUrl);
            }
        }
    }
    catch (Exception ex)
    {
        logger.Error($" - [ERROR]: {SerializeCrawlError(ex)}");
        ErrorHandle(ex, feedUrl);
    }

    logger.Info("Done!");
}

/// <summary>
/// Deletes files in <paramref name="logFolder"/> whose name starts with a timestamp (the text
/// before the first '-') in <c>LogDateTimeFormat</c> that is more than ten days in the past.
/// Files whose name does not parse as such a timestamp are left alone.
/// </summary>
/// <param name="logFolder">Existing folder that holds the crawler log files.</param>
private static void DeleteExpiredCrawlerLogs(string logFolder)
{
    const int retentionDays = 10;
    var cutoff = DateTime.Now.AddDays(-retentionDays);

    // Files are checked one by one instead of being collected in a dictionary keyed by
    // timestamp: two files sharing a timestamp must not abort the cleanup with a duplicate key.
    foreach (var file in Directory.GetFiles(logFolder))
    {
        var timestampText = Path.GetFileNameWithoutExtension(file).Split('-')[0];
        var hasTimestamp = DateTime.TryParseExact(
            timestampText,
            LogDateTimeFormat,
            CultureInfo.InvariantCulture,
            DateTimeStyles.None,
            out DateTime loggedAt);

        if (hasTimestamp && loggedAt < cutoff)
        {
            File.Delete(file);
        }
    }
}

/// <summary>
/// Serializes an exception to JSON, ignoring reference loops in the object graph instead of
/// failing on them.
/// </summary>
/// <param name="error">The exception to serialize.</param>
/// <returns>The JSON text produced by Json.NET.</returns>
private static string SerializeCrawlError(Exception error)
{
    return JsonConvert.SerializeObject(
        error,
        new JsonSerializerSettings { ReferenceLoopHandling = ReferenceLoopHandling.Ignore });
}