Exemplo n.º 1
0
 /// <summary>
 /// Chooses the key used to identify a feed item.
 /// </summary>
 /// <param name="item">Feed item whose <c>Id</c> and <c>Link</c> are inspected.</param>
 /// <returns>
 /// The item's <c>Id</c> when it is not blank; otherwise its <c>Link</c> when that is not blank;
 /// otherwise <c>null</c>.
 /// </returns>
 private static string GenerateFeedItemKey(SimpleFeedlyFeedItem item)
 {
     // The identifier wins whenever it carries any non-whitespace content.
     if (!string.IsNullOrWhiteSpace(item.Id))
     {
         return item.Id;
     }

     // Fall back to the link; a blank link means no usable key exists.
     return string.IsNullOrWhiteSpace(item.Link) ? null : item.Link;
 }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the feed at <paramref name="feedUrl"/> and converts its entries into a
        /// <see cref="SimpleFeedlyFeed"/>, trying the crawler engines one after another
        /// (in enum order) until one of them succeeds.
        /// </summary>
        /// <param name="feedUrl">URL of the feed to read.</param>
        /// <param name="defaultCrawlerEngine">
        /// The channel's preferred engine. When <c>null</c>, every engine is tried in a single pass.
        /// </param>
        /// <param name="isRest">
        /// Selects which engines this pass may use when a default engine is given:
        /// <c>false</c> (first pass) runs only the default engine;
        /// <c>true</c> (second pass) runs every engine except the default one.
        /// A failed first pass triggers the second pass automatically.
        /// </param>
        /// <param name="engine">
        /// Receives the engine that produced the result. Stays at
        /// <c>RssCrawlerEngine.CodeHollowFeedReader</c> when no engine succeeded.
        /// </param>
        /// <param name="error">
        /// <c>null</c> when a feed is returned; otherwise the exception raised by the last engine
        /// that failed, if any engine ran.
        /// </param>
        /// <returns>The feed with its items, or <c>null</c> when no engine could read it.</returns>
        public static SimpleFeedlyFeed GetFeedsFromChannel(string feedUrl, RssCrawlerEngine? defaultCrawlerEngine, bool isRest, out RssCrawlerEngine engine, out Exception error)
        {
            error  = null;
            engine = RssCrawlerEngine.CodeHollowFeedReader;

            List<SimpleFeedlyFeedItem> items = null;

            foreach (RssCrawlerEngine engineLoop in (RssCrawlerEngine[])Enum.GetValues(typeof(RssCrawlerEngine)))
            {
                // No default engine: everything runs. Otherwise the first pass runs only the
                // default engine and the second pass runs all the others.
                var isDefaultEngine = engineLoop == defaultCrawlerEngine;
                var canRun          = defaultCrawlerEngine == null || (isRest ? !isDefaultEngine : isDefaultEngine);

                if (!canRun)
                {
                    continue;
                }

                List<SimpleFeedlyFeedItem> engineItems;

                try
                {
                    // Each engine fills its own list, so a failure half-way through never leaks
                    // partial items into the result produced by a later engine.
                    engineItems = ReadItemsWithEngine(engineLoop, feedUrl);
                }
                catch (Exception ex)
                {
                    // Best effort: remember the failure and let the next engine try.
                    error = ex;
                    continue;
                }

                if (engineItems == null)
                {
                    // Enum member without a reader implementation.
                    continue;
                }

                items  = engineItems;
                engine = engineLoop;
                break;
            }

            if (items == null)
            {
                // Only the first pass with an explicit default engine has engines left to try.
                if (!isRest && defaultCrawlerEngine != null)
                {
                    return GetFeedsFromChannel(feedUrl, defaultCrawlerEngine, true, out engine, out error);
                }

                return null;
            }

            // Failures of earlier engines are irrelevant once one engine succeeded.
            error = null;
            return new SimpleFeedlyFeed { Items = items };
        }

        /// <summary>
        /// Dispatches to the reader of the given engine; returns <c>null</c> for an engine without a reader.
        /// </summary>
        private static List<SimpleFeedlyFeedItem> ReadItemsWithEngine(RssCrawlerEngine crawlerEngine, string feedUrl)
        {
            switch (crawlerEngine)
            {
                case RssCrawlerEngine.CodeHollowFeedReader:
                    return ReadItemsWithCodeHollow(feedUrl);

                case RssCrawlerEngine.SyndicationFeed:
                    return ReadItemsWithSyndicationFeed(feedUrl);

                case RssCrawlerEngine.ParseRssByXml:
                    return ReadItemsWithRawXml(feedUrl);

                default:
                    return null;
            }
        }

        /// <summary>
        /// Reads the feed with CodeHollow.FeedReader and maps its items.
        /// </summary>
        private static List<SimpleFeedlyFeedItem> ReadItemsWithCodeHollow(string feedUrl)
        {
            // The public entry point is synchronous, so the async reader is awaited by blocking.
            var feed  = CodeHollow.FeedReader.FeedReader.ReadAsync(feedUrl).GetAwaiter().GetResult();
            var items = new List<SimpleFeedlyFeedItem>();

            foreach (var item in feed.Items)
            {
                items.Add(new SimpleFeedlyFeedItem
                {
                    Id             = item.Id,
                    Title          = string.IsNullOrWhiteSpace(item.Title) ? item.Link : item.Title,
                    Link           = item.Link,
                    Description    = item.Description,
                    PublishingDate = item.PublishingDate ?? DateTime.Now,
                    Author         = item.Author,
                    Content        = item.Content
                });
            }

            return items;
        }

        /// <summary>
        /// Reads the feed with System.ServiceModel.Syndication and maps its items.
        /// </summary>
        private static List<SimpleFeedlyFeedItem> ReadItemsWithSyndicationFeed(string feedUrl)
        {
            // NOTE(review): DTD parsing is enabled for content fetched from an arbitrary URL;
            // confirm this is required and consider restricting entity expansion / external resolution.
            var settings = new XmlReaderSettings { DtdProcessing = DtdProcessing.Parse };
            var items    = new List<SimpleFeedlyFeedItem>();

            using (var reader = XmlReader.Create(feedUrl, settings))
            {
                var feed = System.ServiceModel.Syndication.SyndicationFeed.Load(reader);

                foreach (System.ServiceModel.Syndication.SyndicationItem item in feed.Items)
                {
                    // A link element may carry no Uri; fall back to the item id in that case too.
                    var link = item.Links.FirstOrDefault()?.Uri?.ToString();
                    link = string.IsNullOrWhiteSpace(link) ? item.Id : link;

                    items.Add(new SimpleFeedlyFeedItem
                    {
                        Id             = item.Id,
                        Title          = string.IsNullOrWhiteSpace(item.Title?.Text) ? link : item.Title.Text,
                        Link           = link,
                        Description    = item.Summary?.Text,
                        // NOTE(review): unlike the other engines there is no DateTime.Now fallback here.
                        PublishingDate = item.PublishDate.UtcDateTime,
                        Author         = item.Authors.FirstOrDefault()?.Name ?? string.Empty,
                        Content        = item.Content?.ToString()
                    });
                }
            }

            return items;
        }

        /// <summary>
        /// Downloads the feed as text, strips characters that break XML parsing and reads the
        /// <c>rss/channel/item</c> nodes by hand. Items without a link are skipped.
        /// </summary>
        private static List<SimpleFeedlyFeedItem> ReadItemsWithRawXml(string feedUrl)
        {
            string xmlString;

            using (WebClient client = new WebClient())
            {
                var htmlData = client.DownloadData(feedUrl);
                xmlString = System.Text.Encoding.UTF8.GetString(htmlData);
            }

            // Removes control characters and '&' (0x26).
            // NOTE(review): dropping '&' also mangles valid entities such as "&amp;" inside links.
            string r = "[\x00-\x08\x0B\x0C\x0E-\x1F\x26]";
            xmlString = Regex.Replace(xmlString, r, "", RegexOptions.Compiled);

            // GetString keeps a leading byte-order mark, which LoadXml rejects; leading whitespace
            // in front of the XML declaration is rejected as well.
            xmlString = xmlString.TrimStart('\uFEFF', ' ', '\t', '\r', '\n');

            XmlDocument rssXmlDoc = new XmlDocument();
            rssXmlDoc.LoadXml(xmlString);

            var items = new List<SimpleFeedlyFeedItem>();

            foreach (XmlNode rssNode in rssXmlDoc.SelectNodes("rss/channel/item"))
            {
                var feedItem = new SimpleFeedlyFeedItem();

                feedItem.Link = rssNode.SelectSingleNode("link")?.InnerText;

                var title = rssNode.SelectSingleNode("title")?.InnerText;
                feedItem.Title = string.IsNullOrWhiteSpace(title) ? feedItem.Link : title;

                feedItem.Description = rssNode.SelectSingleNode("description")?.InnerText;

                // Missing or unparsable dates fall back to the current time.
                DateTime pubDate    = DateTime.Now;
                XmlNode pubDateNode = rssNode.SelectSingleNode("pubDate");

                if (pubDateNode != null && DateTime.TryParse(pubDateNode.InnerText, out DateTime tmpDate))
                {
                    pubDate = tmpDate;
                }

                feedItem.PublishingDate = pubDate;

                if (!string.IsNullOrWhiteSpace(feedItem.Link))
                {
                    items.Add(feedItem);
                }
            }

            return items;
        }