/// <summary>
/// Stores <paramref name="engine"/> as the default crawler engine for one channel in the
/// LiteDB "channels" collection.
/// Equivalent to: UPDATE RssChannels SET RssCrawlerEngine = @engine WHERE Id = @channelId.
/// </summary>
/// <param name="channelId">LiteDB id of the channel row to update.</param>
/// <param name="engine">Engine to record as the channel's default.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when no channel with <paramref name="channelId"/> exists. (Previously this case
/// dereferenced a null row and surfaced as an undiagnosable NullReferenceException.)
/// </exception>
public static void UpdateChannelDefaultEngine(ObjectId channelId, RssCrawlerEngine engine)
{
    using (var db = new LiteDatabase(GetDbPath()))
    {
        var col = db.GetCollection<RssChannelRow>("channels");

        // Keep the unique index on Id so FindById stays an indexed lookup.
        col.EnsureIndex(x => x.Id, true);

        var channel = col.FindById(channelId);
        if (channel == null)
        {
            // BUGFIX: fail loudly with context instead of a NullReferenceException below.
            throw new InvalidOperationException($"Channel <{channelId}> was not found");
        }

        channel.RssCrawlerEngine = engine;
        col.Update(channel);
    }
}
/// <summary>
/// Fetches feed items from <paramref name="feedUrl"/>, trying the channel's stored default
/// engine first and, if it fails or yields nothing, falling back to every other crawler
/// engine in enum order until one succeeds.
/// </summary>
/// <param name="feedUrl">Feed URL to crawl.</param>
/// <param name="defaultEngineType">Engine to try first (the channel's stored default).</param>
/// <param name="engineTypeResult">
/// Engine that produced the returned items on success; on total failure, the last engine tried.
/// </param>
/// <param name="error">Last error encountered, or null when an engine succeeded.</param>
/// <returns>A feed whose Items list is never null (empty when every engine failed).</returns>
public static SimpleFeedlyFeed GetFeedsFromChannel(string feedUrl, RssCrawlerEngine defaultEngineType, out RssCrawlerEngine engineTypeResult, out Exception error)
{
    // Map an engine enum value to its implementation; throws for unmapped values.
    IRssEngine getEngine(RssCrawlerEngine type)
    {
        switch (type)
        {
            case RssCrawlerEngine.SyndicationFeed:
                return new SyndicationFeedEngine();
            case RssCrawlerEngine.CodeHollowFeedReader:
                return new CodeHollowFeedReaderEngine();
            case RssCrawlerEngine.ParseRssByXml:
                return new ParseRssByXmlEngine();
            default:
                throw new Exception($"Can not find crawler engine for type <{type}>");
        }
    }

    RssCrawlerEngine currentEngineType = RssCrawlerEngine.CodeHollowFeedReader;
    var items = new List<SimpleFeedlyFeedItem>();
    error = null;

    try
    {
        // Try the channel's default engine first.
        var feedItems = getEngine(defaultEngineType).GetItems(feedUrl, out error);

        // BUGFIX: guard against a null item list before reading Count (was a potential
        // NullReferenceException; other paths already treat feedItems as nullable).
        if (error == null && feedItems != null && feedItems.Count > 0)
        {
            currentEngineType = defaultEngineType;
            items = feedItems;
        }
        else
        {
            // Default engine failed or returned nothing: try every other engine in order.
            error = null;
            foreach (RssCrawlerEngine engineLoop in (RssCrawlerEngine[])Enum.GetValues(typeof(RssCrawlerEngine)))
            {
                if (engineLoop == defaultEngineType)
                {
                    continue; // already tried above
                }

                currentEngineType = engineLoop;
                feedItems = getEngine(engineLoop).GetItems(feedUrl, out error);

                // BUGFIX: only keep the result of a *successful* engine. Previously
                // items.AddRange(...) also accumulated partial output of failing engines,
                // which was either discarded on success or returned as junk on total failure.
                if (error == null && feedItems != null && feedItems.Count > 0)
                {
                    items = feedItems;
                    break;
                }
            }
        }
    }
    catch (Exception ex)
    {
        error = ex;
    }

    engineTypeResult = currentEngineType;
    return new SimpleFeedlyFeed { Items = items ?? new List<SimpleFeedlyFeedItem>() };
}
/// <summary>
/// Starts the background crawler loop on a thread-pool task (fire-and-forget): it
/// repeatedly loads all active channels, fetches their feeds, stores new items, and
/// sleeps between passes. The inner loop never exits.
/// </summary>
/// <param name="logger">Logger for progress and error reporting.</param>
/// <param name="channelFetchingDelay">Randomized per-channel sleep window applied when a channel yields nothing new.</param>
/// <param name="channelErrorDelay">Per-channel sleep applied after a channel-level error.</param>
/// <param name="errorDelay">Sleep applied after a pass-level (non-channel) error.</param>
/// <param name="loopDelay">Sleep between full passes over all channels.</param>
public static void InitializeRssCrawler(ILogger logger, RandomTimeSpan channelFetchingDelay, TimeSpan channelErrorDelay, TimeSpan errorDelay, TimeSpan loopDelay)
{
    System.Threading.Tasks.Task.Run(() =>
    {
        //var channelHubCtx = GlobalHost.ConnectionManager.GetHubContext<ChannelHub>();
        ObjectCache cache = MemoryCache.Default;

        while (true)
        {
            // Reset the in-memory dedup cache once per calendar day.
            if (_currentDate != DateTime.Now.Day)
            {
                _feedCache = new HashSet<string>();
                _currentDate = DateTime.Now.Day;
            }

            var feedUrl = string.Empty;
            try
            {
                // BUGFIX: was `List<RssChannelsRow> channels = channels = ...` — a
                // redundant self-assignment in the initializer.
                List<RssChannelsRow> channels = SimpleFeedlyDatabaseAccess.GetActiveChannels().OrderBy(x => x.Id).ToList();
                var count = 0;
                foreach (var channel in channels)
                {
                    feedUrl = channel.Link;
                    count++;
                    logger.Info($"- [{count}/{channels.Count}] Working on channel: {channel.Id} | {feedUrl}");
                    //channelHubCtx.Clients.All.updateChannelProgress(new { Message = $"<strong>Fetching</strong> <a href='{channel.Link}' target='_blank'>{channel.Link}</a>", IsSleeping = false });

                    if (string.IsNullOrWhiteSpace(feedUrl))
                    {
                        logger.Warn($"=> Channel has empty link: {channel.Id}");
                        continue;
                    }

                    // A cache entry under this key marks the channel as "sleeping" until it expires.
                    var channelSleepingCacheKey = "channel_is_sleeping|" + channel.Id;
                    var isSleeping = cache[channelSleepingCacheKey] as bool?;
                    if (isSleeping == null || isSleeping == false)
                    {
                        try
                        {
                            // CLEANUP: out-var declaration replaces a local whose initial
                            // value (CodeHollowFeedReader) was always overwritten here.
                            var feed = RssCrawler.GetFeedsFromChannel(feedUrl, channel.RssCrawlerEngine, false, out RssCrawlerEngine usedEngine, out Exception fetchFeedError);

                            // Remember which engine worked so the next pass tries it first;
                            // null clears the default when the fetch failed entirely.
                            SimpleFeedlyDatabaseAccess.UpdateChannelDefaultEngine((long)channel.Id, feed == null ? (RssCrawlerEngine?)null : usedEngine);

                            if (feed != null)
                            {
                                logger.Info($" + Number of items: {feed.Items.Count}");
                                var hasNew = false;
                                foreach (var fItem in feed.Items)
                                {
                                    if (!StringUtils.IsUrl(fItem.Link))
                                    {
                                        continue;
                                    }

                                    var feedItemKey = GenerateFeedItemKey(fItem);
                                    var feedCacheKey = GenerateFeedCacheKey((long)channel.Id, feedItemKey);
                                    if (string.IsNullOrWhiteSpace(feedItemKey) || string.IsNullOrWhiteSpace(fItem.Link))
                                    {
                                        logger.Info($" + Skipped item: {JsonConvert.SerializeObject(fItem)}");
                                        continue;
                                    }

                                    // Two-level dedup: in-memory cache first, then the database.
                                    if (!_feedCache.Contains(feedCacheKey))
                                    {
                                        if (!SimpleFeedlyDatabaseAccess.CheckExistFeedItem((long)channel.Id, feedItemKey))
                                        {
                                            var feedItem = new RssFeedItemsRow
                                            {
                                                ChannelId = channel.Id,
                                                FeedItemKey = feedItemKey,
                                                Title = string.IsNullOrWhiteSpace(fItem.Title) ? fItem.Link : fItem.Title,
                                                Link = fItem.Link,
                                                Description = fItem.Description,
                                                PublishingDate = fItem.PublishingDate,
                                                Author = fItem.Author,
                                                Content = fItem.Content
                                            };

                                            SimpleFeedlyDatabaseAccess.InsertFeedItem(feedItem);
                                            hasNew = true;
                                        }

                                        _feedCache.Add(feedCacheKey);
                                    }
                                }

                                SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus((long)channel.Id, false, null);

                                if (!hasNew)
                                {
                                    // Nothing new: put the channel to sleep for a random while.
                                    // CLEANUP: unused `randomExpiryTime` local removed.
                                    cache.Add(channelSleepingCacheKey, true, DateTime.Now.Add(channelFetchingDelay.GenerateRamdomValue()));
                                }
                            }
                            else
                            {
                                SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus((long)channel.Id, true, fetchFeedError == null ? null : JsonConvert.SerializeObject(fetchFeedError));
                                if (fetchFeedError != null)
                                {
                                    ErrorHandle(fetchFeedError, feedUrl);
                                }
                            }
                        }
                        catch (Exception err)
                        {
                            SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus((long)channel.Id, true, JsonConvert.SerializeObject(err));
                            cache.Add(channelSleepingCacheKey, true, DateTime.Now.Add(channelErrorDelay));
                            logger.Error(err, $"An error occurred on channel: {channel.Id} | {feedUrl}");
                            ErrorHandle(err, feedUrl);
                        }
                    }
                    else
                    {
                        logger.Info($" + sleeping...");
                    }
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex, "An error occurred");
                ErrorHandle(ex, feedUrl);
                System.Threading.Thread.Sleep(errorDelay);
            }
            //channelHubCtx.Clients.All.updateChannelProgress(new { Message = "<span class='link-muted'>Crawler's sleeping...</span>", IsSleeping = true });

            _currentDate = DateTime.Now.Day;
            // We should delay a little bit between passes, some seconds maybe.
            System.Threading.Thread.Sleep(loopDelay);
        }
    });
}
/// <summary>
/// Fetches feed items from <paramref name="feedUrl"/>, trying each crawler engine in enum
/// order until one succeeds (CodeHollow FeedReader, then SyndicationFeed, then raw XML parse).
/// </summary>
/// <param name="feedUrl">Feed URL to crawl.</param>
/// <param name="defaultCrawlerEngine">
/// The channel's default engine. When null, every engine is eligible in a single pass.
/// </param>
/// <param name="isRest">
/// Normally this method is called two times: the first time (false) tries only the channel's
/// default engine; the second time (true) tries all the remaining ("rest") engines.
/// The second call is made automatically via recursion at the bottom of this method.
/// </param>
/// <param name="engine">Engine that produced the result (defaults to CodeHollowFeedReader).</param>
/// <param name="error">Last error encountered; reset to null when an engine succeeds.</param>
/// <returns>The parsed feed, or null when every eligible engine failed.</returns>
public static SimpleFeedlyFeed GetFeedsFromChannel(string feedUrl, RssCrawlerEngine? defaultCrawlerEngine, bool isRest, out RssCrawlerEngine engine, out Exception error)
{
    error = null;
    engine = RssCrawlerEngine.CodeHollowFeedReader;
    SimpleFeedlyFeed result = new SimpleFeedlyFeed();
    var items = new List<SimpleFeedlyFeedItem>();
    // status == true once any engine has produced a result; stops further attempts.
    var status = false;

    foreach (RssCrawlerEngine engineLoop in (RssCrawlerEngine[])Enum.GetValues(typeof(RssCrawlerEngine)))
    {
        if (status)
        {
            break;
        }

        // Decide whether this engine is eligible on this pass (see isRest above).
        var canRun = false;
        if (defaultCrawlerEngine == null)
        {
            canRun = true;
        }
        else
        {
            if (!isRest) // first time
            {
                canRun = engineLoop == defaultCrawlerEngine;
            }
            else
            {
                canRun = engineLoop != defaultCrawlerEngine;
            }
        }

        if (!canRun)
        {
            continue;
        }

        if (engineLoop == RssCrawlerEngine.CodeHollowFeedReader && canRun)
        {
            if (!status)
            {
                try
                {
                    // NOTE(review): sync-over-async (.GetAwaiter().GetResult()) blocks the
                    // calling thread; acceptable here only because the crawler runs on a
                    // dedicated background task — confirm no sync-context callers exist.
                    var feed = CodeHollow.FeedReader.FeedReader.ReadAsync(feedUrl).GetAwaiter().GetResult();
                    foreach (var item in feed.Items)
                    {
                        var feedItem = new SimpleFeedlyFeedItem
                        {
                            Id = item.Id,
                            // Fall back to the link when the item has no usable title.
                            Title = string.IsNullOrWhiteSpace(item.Title) ? item.Link : item.Title,
                            Link = item.Link,
                            Description = item.Description,
                            PublishingDate = item.PublishingDate ?? DateTime.Now,
                            Author = item.Author,
                            Content = item.Content
                        };
                        items.Add(feedItem);
                    }
                    engine = RssCrawlerEngine.CodeHollowFeedReader;
                    status = true;
                }
                catch (Exception ex)
                {
                    // Remember the failure and let the next engine try.
                    error = ex;
                }
            }
        }

        if (engineLoop == RssCrawlerEngine.SyndicationFeed && canRun)
        {
            if (!status)
            {
                try
                {
                    XmlReaderSettings settings = new XmlReaderSettings();
                    // NOTE(review): DtdProcessing.Parse on untrusted remote feeds enables
                    // DTD expansion (XXE / entity-bomb exposure) — consider Prohibit/Ignore
                    // unless DTD-bearing feeds are actually required; verify before changing.
                    settings.DtdProcessing = DtdProcessing.Parse;
                    using (var reader = XmlReader.Create(feedUrl, settings))
                    {
                        // Load materializes the whole feed, so closing the reader early is safe.
                        var feed = System.ServiceModel.Syndication.SyndicationFeed.Load(reader);
                        reader.Close();
                        foreach (System.ServiceModel.Syndication.SyndicationItem item in feed.Items)
                        {
                            var feedItem = new SimpleFeedlyFeedItem();
                            // Prefer the first <link>; fall back to the item id.
                            var link = item.Links.FirstOrDefault()?.Uri.ToString();
                            link = string.IsNullOrWhiteSpace(link) ? item.Id : link;
                            feedItem.Id = item.Id;
                            feedItem.Title = string.IsNullOrWhiteSpace(item.Title?.Text) ? link : item.Title.Text;
                            feedItem.Link = link;
                            feedItem.Description = item.Summary?.Text;
                            feedItem.PublishingDate = item.PublishDate.UtcDateTime;
                            feedItem.Author = item.Authors.FirstOrDefault()?.Name ?? string.Empty;
                            feedItem.Content = item.Content?.ToString();
                            items.Add(feedItem);
                        }
                    }
                    engine = RssCrawlerEngine.SyndicationFeed;
                    status = true;
                }
                catch (Exception ex)
                {
                    error = ex;
                }
            }
        }

        if (engineLoop == RssCrawlerEngine.ParseRssByXml && canRun)
        {
            if (!status)
            {
                try
                {
                    var xmlString = string.Empty;
                    using (WebClient client = new WebClient())
                    {
                        var htmlData = client.DownloadData(feedUrl);
                        xmlString = System.Text.Encoding.UTF8.GetString(htmlData);
                        // ReplaceHexadecimalSymbols: strip control characters (and raw '&',
                        // \x26) that would make XmlDocument.LoadXml reject the payload.
                        string r = "[\x00-\x08\x0B\x0C\x0E-\x1F\x26]";
                        xmlString = Regex.Replace(xmlString, r, "", RegexOptions.Compiled);
                    }

                    XmlDocument rssXmlDoc = new XmlDocument();
                    rssXmlDoc.LoadXml(xmlString);

                    // Parse the Items in the RSS file (RSS 2.0 layout only; Atom feeds
                    // have no rss/channel/item nodes and yield an empty result here).
                    XmlNodeList rssNodes = rssXmlDoc.SelectNodes("rss/channel/item");

                    // Iterate through the items in the RSS file
                    foreach (XmlNode rssNode in rssNodes)
                    {
                        var feedItem = new SimpleFeedlyFeedItem();

                        XmlNode rssSubNode = rssNode.SelectSingleNode("link");
                        feedItem.Link = rssSubNode != null ? rssSubNode.InnerText : null;

                        rssSubNode = rssNode.SelectSingleNode("title");
                        feedItem.Title = rssSubNode != null ? rssSubNode.InnerText : null;
                        // Fall back to the link when the item has no usable title.
                        feedItem.Title = string.IsNullOrWhiteSpace(feedItem.Title) ? feedItem.Link : feedItem.Title;

                        rssSubNode = rssNode.SelectSingleNode("description");
                        feedItem.Description = rssSubNode != null ? rssSubNode.InnerText : null;

                        rssSubNode = rssNode.SelectSingleNode("pubDate");
                        // Default the publish date to "now" when missing or unparsable.
                        DateTime pubDate = DateTime.Now;
                        if (rssSubNode != null)
                        {
                            if (DateTime.TryParse(rssSubNode.InnerText, out DateTime tmpDate))
                            {
                                pubDate = tmpDate;
                            }
                        }
                        feedItem.PublishingDate = pubDate;

                        // Items without a link are useless downstream — skip them.
                        if (!string.IsNullOrWhiteSpace(feedItem.Link))
                        {
                            items.Add(feedItem);
                        }
                    }
                    engine = RssCrawlerEngine.ParseRssByXml;
                    status = true;
                }
                catch (Exception ex)
                {
                    error = ex;
                }
            }
        }
    }

    // isRest == false => if it's the first time, we maybe need to call a 2nd time
    // status == false => we only need the 2nd call when the eligible engines returned nothing
    // defaultCrawlerEngine == null already processed all engines, so no 2nd call is needed
    if (isRest == false && !status && defaultCrawlerEngine != null)
    {
        return (GetFeedsFromChannel(feedUrl, defaultCrawlerEngine, true, out engine, out error));
    }

    if (!status)
    {
        // Every eligible engine failed; `error` holds the last failure.
        return (null);
    }
    else
    {
        // Success: clear any error recorded by earlier failing engines.
        error = null;
        result.Items = items;
        return (result);
    }
}