/// <summary>
/// Builds a deduplication key for a feed item: the item's Id when present,
/// otherwise its Link; null when neither is available.
/// </summary>
/// <param name="item">Feed item to derive a key from.</param>
/// <returns>Id, Link, or null in that order of preference.</returns>
private static string GenerateFeedItemKey(SimpleFeedlyFeedItem item)
{
    if (!string.IsNullOrWhiteSpace(item.Id))
    {
        return item.Id;
    }

    return string.IsNullOrWhiteSpace(item.Link) ? null : item.Link;
}
/// <summary>
/// Reads a feed URL and returns its parsed items, trying crawler engines until one succeeds.
/// </summary>
/// <param name="feedUrl">Feed URL to crawl.</param>
/// <param name="defaultCrawlerEngine">
/// The channel's preferred engine. When null, every engine is tried in a single pass.
/// </param>
/// <param name="isRest">
/// Normally this method is called two times: the first time (isRest = false) with only the
/// channel's default engine, and a second time (isRest = true) with the remaining engines
/// when the first pass failed.
/// </param>
/// <param name="engine">The engine that produced the successful result.</param>
/// <param name="error">The last exception thrown by a failing engine; null on success.</param>
/// <returns>The parsed feed, or null when every applicable engine failed.</returns>
public static SimpleFeedlyFeed GetFeedsFromChannel(string feedUrl, RssCrawlerEngine? defaultCrawlerEngine, bool isRest, out RssCrawlerEngine engine, out Exception error)
{
    error = null;
    engine = RssCrawlerEngine.CodeHollowFeedReader;

    var items = new List<SimpleFeedlyFeedItem>();
    var status = false;

    foreach (RssCrawlerEngine engineLoop in (RssCrawlerEngine[])Enum.GetValues(typeof(RssCrawlerEngine)))
    {
        if (status)
        {
            break;
        }

        // With a configured default engine, the first pass runs only that engine and the
        // "rest" pass runs every other engine. With no default, all engines are eligible.
        var canRun = defaultCrawlerEngine == null
            || (isRest ? engineLoop != defaultCrawlerEngine : engineLoop == defaultCrawlerEngine);
        if (!canRun)
        {
            continue;
        }

        try
        {
            List<SimpleFeedlyFeedItem> parsed = null;
            switch (engineLoop)
            {
                case RssCrawlerEngine.CodeHollowFeedReader:
                    parsed = ParseWithFeedReader(feedUrl);
                    break;
                case RssCrawlerEngine.SyndicationFeed:
                    parsed = ParseWithSyndicationFeed(feedUrl);
                    break;
                case RssCrawlerEngine.ParseRssByXml:
                    parsed = ParseWithXmlDocument(feedUrl);
                    break;
            }

            if (parsed != null)
            {
                items.AddRange(parsed);
                engine = engineLoop;
                status = true;
            }
        }
        catch (Exception ex)
        {
            // Remember the failure and fall through to the next eligible engine.
            error = ex;
        }
    }

    // First pass with a specific default engine failed: retry once with the remaining
    // engines. defaultCrawlerEngine == null already tried everything, so no retry then.
    if (isRest == false && !status && defaultCrawlerEngine != null)
    {
        return GetFeedsFromChannel(feedUrl, defaultCrawlerEngine, true, out engine, out error);
    }

    if (!status)
    {
        return null;
    }

    error = null;
    var result = new SimpleFeedlyFeed();
    result.Items = items;
    return result;
}

/// <summary>
/// Engine 1: parse the feed with CodeHollow.FeedReader.
/// </summary>
private static List<SimpleFeedlyFeedItem> ParseWithFeedReader(string feedUrl)
{
    // NOTE(review): sync-over-async (GetAwaiter().GetResult()) — deadlock-prone under a
    // synchronization context; consider making the call chain async.
    var feed = CodeHollow.FeedReader.FeedReader.ReadAsync(feedUrl).GetAwaiter().GetResult();

    var items = new List<SimpleFeedlyFeedItem>();
    foreach (var item in feed.Items)
    {
        items.Add(new SimpleFeedlyFeedItem
        {
            Id = item.Id,
            Title = string.IsNullOrWhiteSpace(item.Title) ? item.Link : item.Title,
            Link = item.Link,
            Description = item.Description,
            // NOTE(review): local-time fallback for undated items; UtcNow may be safer.
            PublishingDate = item.PublishingDate ?? DateTime.Now,
            Author = item.Author,
            Content = item.Content
        });
    }

    return items;
}

/// <summary>
/// Engine 2: parse the feed with System.ServiceModel.Syndication.
/// </summary>
private static List<SimpleFeedlyFeedItem> ParseWithSyndicationFeed(string feedUrl)
{
    var items = new List<SimpleFeedlyFeedItem>();

    var settings = new XmlReaderSettings { DtdProcessing = DtdProcessing.Parse };
    using (var reader = XmlReader.Create(feedUrl, settings))
    {
        var feed = System.ServiceModel.Syndication.SyndicationFeed.Load(reader);
        foreach (System.ServiceModel.Syndication.SyndicationItem item in feed.Items)
        {
            // Prefer the first link URI; fall back to the item id when no link exists.
            var link = item.Links.FirstOrDefault()?.Uri.ToString();
            link = string.IsNullOrWhiteSpace(link) ? item.Id : link;

            items.Add(new SimpleFeedlyFeedItem
            {
                Id = item.Id,
                Title = string.IsNullOrWhiteSpace(item.Title?.Text) ? link : item.Title.Text,
                Link = link,
                Description = item.Summary?.Text,
                PublishingDate = item.PublishDate.UtcDateTime,
                Author = item.Authors.FirstOrDefault()?.Name ?? string.Empty,
                Content = item.Content?.ToString()
            });
        }
    }

    return items;
}

/// <summary>
/// Engine 3: download the raw feed and parse rss/channel/item nodes with XmlDocument.
/// </summary>
private static List<SimpleFeedlyFeedItem> ParseWithXmlDocument(string feedUrl)
{
    string xmlString;
    using (WebClient client = new WebClient())
    {
        var htmlData = client.DownloadData(feedUrl);
        xmlString = System.Text.Encoding.UTF8.GetString(htmlData);

        // Strip control characters that break XmlDocument.LoadXml.
        // NOTE(review): \x26 is '&', so this also destroys XML entities (e.g. &amp;);
        // confirm this is intentional before changing it.
        string r = "[\x00-\x08\x0B\x0C\x0E-\x1F\x26]";
        xmlString = Regex.Replace(xmlString, r, "", RegexOptions.Compiled);
    }

    XmlDocument rssXmlDoc = new XmlDocument();
    rssXmlDoc.LoadXml(xmlString);

    var items = new List<SimpleFeedlyFeedItem>();
    foreach (XmlNode rssNode in rssXmlDoc.SelectNodes("rss/channel/item"))
    {
        var feedItem = new SimpleFeedlyFeedItem();

        feedItem.Link = rssNode.SelectSingleNode("link")?.InnerText;
        feedItem.Title = rssNode.SelectSingleNode("title")?.InnerText;
        feedItem.Title = string.IsNullOrWhiteSpace(feedItem.Title) ? feedItem.Link : feedItem.Title;
        feedItem.Description = rssNode.SelectSingleNode("description")?.InnerText;

        // NOTE(review): local-time fallback for missing/unparsable pubDate; UtcNow may be safer.
        DateTime pubDate = DateTime.Now;
        XmlNode pubDateNode = rssNode.SelectSingleNode("pubDate");
        if (pubDateNode != null && DateTime.TryParse(pubDateNode.InnerText, out DateTime tmpDate))
        {
            pubDate = tmpDate;
        }
        feedItem.PublishingDate = pubDate;

        // Items without a link cannot be keyed/deduplicated, so they are dropped.
        if (!string.IsNullOrWhiteSpace(feedItem.Link))
        {
            items.Add(feedItem);
        }
    }

    return items;
}