Beispiel #1
0
        /// <summary>
        /// Reads every Atom <c>entry</c> element directly below the document root and appends a
        /// <see cref="FeedResult.FeedItem"/> for it to <paramref name="crawlerResult"/>.
        /// </summary>
        /// <param name="config">
        /// Crawler settings read here: <c>FilterByCategories</c>, <c>SummaryTruncateAt</c>,
        /// <c>IncludeRawContent</c> and <c>LoadSocialLinkCounters</c>.
        /// </param>
        /// <param name="doc">Parsed feed document. Nothing is added when it has no root element.</param>
        /// <param name="crawlerResult">Result whose <c>FeedItems</c> collection receives the parsed entries.</param>
        /// <returns>A task that completes once all entries have been processed.</returns>
        private async Task ParseAtomFeed(FeedCrawlerConfig config, XDocument doc, FeedResult crawlerResult)
        {
            if (doc.Root == null)
            {
                return;
            }

            var atomItems = doc.Root.Elements()
                            .Where(i => i.Name.LocalName == "entry");

            bool categoryFilter = config.FilterByCategories.Any();

            // Loop-invariant: namespace of the Media RSS extension elements (media:group, media:thumbnail).
            XNamespace media = "http://search.yahoo.com/mrss/";

            foreach (var atomItem in atomItems)
            {
                if (categoryFilter)
                {
                    bool doesMatchCategoryFilter = false;

                    foreach (var categoryItem in atomItem.Elements().Where(i => i.Name.LocalName == "category"))
                    {
                        // <category> elements without a term attribute cannot match any filter value.
                        var termAttribute = categoryItem.Attribute("term");
                        if (termAttribute == null)
                        {
                            continue;
                        }

                        var categoryValue = termAttribute.Value.ToCleanString();

                        if (config.FilterByCategories.Contains(categoryValue, StringComparer.OrdinalIgnoreCase))
                        {
                            doesMatchCategoryFilter = true;
                            break;
                        }
                    }

                    if (!doesMatchCategoryFilter)
                    {
                        // A category filter is active and this entry carries none of the wanted categories.
                        continue;
                    }
                }

                // Built only after the filter passed, so rejected entries cost no extra lookups.
                var crawlerResultItem = new FeedResult.FeedItem
                {
                    Title = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "title")?.Value,
                    Href  = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "link")?.Attribute("href")?.Value
                };

                var summary = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "content")?.Value;

                if (summary != null)
                {
                    if (config.SummaryTruncateAt == 0)
                    {
                        // 0 means "no truncation": keep the content exactly as delivered.
                        crawlerResultItem.Summary = summary;
                    }
                    else
                    {
                        // Strip markup first so the truncation limit applies to the text only.
                        var contentDoc = new HtmlDocument();
                        contentDoc.LoadHtml(summary);
                        var textContent = contentDoc.DocumentNode.InnerText.Trim();
                        crawlerResultItem.Summary = TruncateAtWord(textContent, config.SummaryTruncateAt);
                    }
                }

                // Prefer <published>; fall back to <updated> when it is missing or unparsable.
                var pubDateValue = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "published")?.Value;
                if (DateTime.TryParse(pubDateValue, out DateTime pubDateDateTime))
                {
                    crawlerResultItem.PublishedOn = pubDateDateTime;
                }
                else
                {
                    var updateDateValue = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "updated")?.Value;
                    if (DateTime.TryParse(updateDateValue, out DateTime updateDateDateTime))
                    {
                        crawlerResultItem.PublishedOn = updateDateDateTime;
                    }
                }

                // A thumbnail element without a url attribute is ignored instead of
                // throwing a NullReferenceException and aborting the whole feed.
                var thumbnailUrl = atomItem.Element(media + "group")
                                           ?.Elements()
                                           .FirstOrDefault(i => i.Name.LocalName == "thumbnail")
                                           ?.Attribute("url")
                                           ?.Value;
                if (thumbnailUrl != null)
                {
                    crawlerResultItem.Thumbnail = thumbnailUrl;
                }

                if (config.IncludeRawContent)
                {
                    crawlerResultItem.RawContent = atomItem.ToString();
                }

                if (config.LoadSocialLinkCounters)
                {
                    // Href is null when the entry has no <link href="...">; it is passed on unchanged.
                    crawlerResultItem.FacebookCount = await _facebookLoader.GetAsync(crawlerResultItem.Href);
                }

                crawlerResult.FeedItems.Add(crawlerResultItem);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Loads the feed at <c>config.Url</c> and converts each of its items, ordered by
        /// publish date, into a <see cref="FeedCrawlerResult.FeedItem"/>.
        /// </summary>
        /// <param name="config">
        /// Crawler settings read here: <c>Url</c>, <c>ResultIdentifier</c> and <c>LoadSocialLinkCounters</c>.
        /// </param>
        /// <returns>
        /// The populated result, or a freshly constructed <see cref="FeedCrawlerResult"/> when
        /// <c>config.Url</c> is null, empty or whitespace.
        /// </returns>
        public async Task<FeedCrawlerResult> DoWorkAsync(FeedCrawlerConfig config)
        {
            if (string.IsNullOrWhiteSpace(config.Url))
            {
                return new FeedCrawlerResult();
            }

            var crawlerResult = new FeedCrawlerResult
            {
                ResultIdentifier = config.ResultIdentifier,
                FeedItems        = new List<FeedCrawlerResult.FeedItem>()
            };

            var syndicationFeed = _feedAbstraction.Get(config.Url);

            foreach (var feedItem in syndicationFeed.Items.OrderBy(x => x.PublishDate))
            {
                int commentCount = 0;

                foreach (SyndicationElementExtension extension in feedItem.ElementExtensions)
                {
                    var extensionElement = extension.GetObject<XElement>();

                    if (extensionElement.Name.LocalName == "comments" &&
                        extensionElement.Name.NamespaceName == "http://purl.org/rss/1.0/modules/slash/")
                    {
                        // An empty or non-numeric slash:comments value must not fail the whole
                        // crawl; in that case the previous count (initially 0) is kept.
                        int parsedCount;
                        if (int.TryParse(extensionElement.Value, out parsedCount))
                        {
                            commentCount = parsedCount;
                        }
                    }
                }

                var crawlerResultItem = new FeedCrawlerResult.FeedItem();

                // Items without a title are kept; their Title stays null.
                crawlerResultItem.Title         = feedItem.Title != null ? feedItem.Title.Text : null;
                crawlerResultItem.CommentsCount = commentCount;

                if (config.LoadSocialLinkCounters)
                {
                    crawlerResultItem.TweetsCount = await _twitterLoader.GetAsync(feedItem.Id);

                    crawlerResultItem.FacebookCount = await _facebookLoader.GetAsync(feedItem.Id);
                }

                if (feedItem.Summary != null)
                {
                    crawlerResultItem.Summary = feedItem.Summary.Text;
                }

                // Use the first link as the item's address; fall back to the item id when there is none.
                if (feedItem.Links.Any())
                {
                    crawlerResultItem.Href = feedItem.Links.First().Uri.ToString();
                }
                else
                {
                    crawlerResultItem.Href = feedItem.Id;
                }

                // Only the date part is stored; the time of day is dropped.
                crawlerResultItem.PublishedOn = feedItem.PublishDate.Date;

                // Serialize the item as RSS 2.0. The using block flushes and releases the
                // writer even when serialization throws.
                StringBuilder builder = new StringBuilder();
                using (XmlWriter writer = XmlWriter.Create(builder))
                {
                    feedItem.SaveAsRss20(writer);
                }

                crawlerResultItem.RawContent = builder.ToString();

                crawlerResult.FeedItems.Add(crawlerResultItem);
            }

            return crawlerResult;
        }