/// <summary>
/// Converts the <c>entry</c> elements found directly under the root of an Atom
/// document into feed items and appends them to <paramref name="crawlerResult"/>.
/// </summary>
/// <remarks>
/// Elements are matched by local name only, so the Atom namespace in use does not matter.
/// When <c>config.FilterByCategories</c> contains at least one value, an entry is skipped
/// unless one of its <c>category</c> elements has a <c>term</c> attribute that (after
/// <c>ToCleanString</c>) matches a configured category, ignoring case.
/// </remarks>
/// <param name="config">Crawler settings (category filter, summary truncation, raw content and social counter switches).</param>
/// <param name="doc">The parsed feed document. Nothing happens when it has no root element.</param>
/// <param name="crawlerResult">Result whose <c>FeedItems</c> collection receives the parsed entries.</param>
private async Task ParseAtomFeed(FeedCrawlerConfig config, XDocument doc, FeedResult crawlerResult)
{
    if (doc.Root == null)
    {
        return;
    }

    // Loop-invariant: the Media RSS namespace used for thumbnails.
    XNamespace media = "http://search.yahoo.com/mrss/";

    var atomItems = doc.Root.Elements()
        .Where(i => i.Name.LocalName == "entry");

    bool categoryFilter = config.FilterByCategories.Any();

    foreach (var atomItem in atomItems)
    {
        var crawlerResultItem = new FeedResult.FeedItem
        {
            Title = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "title")?.Value,
            Href = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "link")?.Attribute("href")?.Value
        };

        if (categoryFilter)
        {
            bool doesMatchCategoryFilter = atomItem.Elements()
                .Where(i => i.Name.LocalName == "category")
                .Select(i => i.Attribute("term"))
                .Where(term => term != null)
                .Any(term => config.FilterByCategories.Contains(term.Value.ToCleanString(), StringComparer.OrdinalIgnoreCase));

            if (!doesMatchCategoryFilter)
            {
                // A category filter is active, but this entry carries none of the requested categories.
                continue;
            }
        }

        var summary = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "content")?.Value;
        if (summary != null)
        {
            if (config.SummaryTruncateAt == 0)
            {
                // 0 means "no truncation": keep the content exactly as delivered.
                crawlerResultItem.Summary = summary;
            }
            else
            {
                // Reduce the markup to its inner text before truncating.
                var contentDoc = new HtmlDocument();
                contentDoc.LoadHtml(summary);
                var textContent = contentDoc.DocumentNode.InnerText.Trim();
                crawlerResultItem.Summary = TruncateAtWord(textContent, config.SummaryTruncateAt);
            }
        }

        // Prefer <published>; fall back to <updated> when it is missing or unparsable.
        var pubDateValue = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "published")?.Value;
        if (DateTime.TryParse(pubDateValue, out DateTime pubDateDateTime))
        {
            crawlerResultItem.PublishedOn = pubDateDateTime;
        }
        else
        {
            var updateDateValue = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "updated")?.Value;
            if (DateTime.TryParse(updateDateValue, out DateTime updateDateDateTime))
            {
                crawlerResultItem.PublishedOn = updateDateDateTime;
            }
        }

        // A thumbnail element without a url attribute is ignored instead of throwing,
        // so a single malformed entry cannot abort parsing of the whole feed.
        var thumbnailUrl = atomItem.Element(media + "group")
            ?.Elements()
            .FirstOrDefault(i => i.Name.LocalName == "thumbnail")
            ?.Attribute("url")
            ?.Value;
        if (thumbnailUrl != null)
        {
            crawlerResultItem.Thumbnail = thumbnailUrl;
        }

        if (config.IncludeRawContent)
        {
            crawlerResultItem.RawContent = atomItem.ToString();
        }

        if (config.LoadSocialLinkCounters)
        {
            crawlerResultItem.FacebookCount = await _facebookLoader.GetAsync(crawlerResultItem.Href);
        }

        crawlerResult.FeedItems.Add(crawlerResultItem);
    }
}
/// <summary>
/// Loads the feed at <c>config.Url</c> and converts its items, ordered by publish date
/// ascending, into a <see cref="FeedCrawlerResult"/>.
/// </summary>
/// <param name="config">Crawler settings; <c>Url</c>, <c>ResultIdentifier</c> and <c>LoadSocialLinkCounters</c> are read.</param>
/// <returns>
/// A result containing one item per feed entry, or a new, unpopulated result when
/// <c>config.Url</c> is null, empty or whitespace.
/// </returns>
public async Task<FeedCrawlerResult> DoWorkAsync(FeedCrawlerConfig config)
{
    if (string.IsNullOrWhiteSpace(config.Url))
    {
        return new FeedCrawlerResult();
    }

    const string slashNamespace = "http://purl.org/rss/1.0/modules/slash/";

    var crawlerResult = new FeedCrawlerResult();
    crawlerResult.ResultIdentifier = config.ResultIdentifier;
    crawlerResult.FeedItems = new List<FeedCrawlerResult.FeedItem>();

    var syndicationFeed = _feedAbstraction.Get(config.Url);

    foreach (var feedItem in syndicationFeed.Items.OrderBy(x => x.PublishDate))
    {
        // The comment count comes from the slash:comments extension element, if present.
        int commentCount = 0;
        foreach (SyndicationElementExtension extension in feedItem.ElementExtensions)
        {
            var extensionElement = extension.GetObject<XElement>();
            if (extensionElement.Name.LocalName == "comments"
                && extensionElement.Name.NamespaceName == slashNamespace
                && int.TryParse(extensionElement.Value, out int parsedCount))
            {
                // Non-numeric values are ignored rather than failing the whole crawl.
                commentCount = parsedCount;
            }
        }

        var crawlerResultItem = new FeedCrawlerResult.FeedItem();

        // Title can be absent on a syndication item; leave it null in that case.
        crawlerResultItem.Title = feedItem.Title?.Text;

        if (config.LoadSocialLinkCounters)
        {
            crawlerResultItem.TweetsCount = await _twitterLoader.GetAsync(feedItem.Id);
            crawlerResultItem.FacebookCount = await _facebookLoader.GetAsync(feedItem.Id);
        }

        crawlerResultItem.CommentsCount = commentCount;

        if (feedItem.Summary != null)
        {
            crawlerResultItem.Summary = feedItem.Summary.Text;
        }

        // Use the first link when there is one; otherwise fall back to the item id.
        if (feedItem.Links.Any())
        {
            crawlerResultItem.Href = feedItem.Links.First().Uri.ToString();
        }
        else
        {
            crawlerResultItem.Href = feedItem.Id;
        }

        // Only the date component of the publish timestamp is kept.
        crawlerResultItem.PublishedOn = feedItem.PublishDate.Date;

        // Serialize the item as RSS 2.0; disposing the writer flushes it into the
        // builder and guarantees cleanup even if serialization throws.
        var builder = new StringBuilder();
        using (XmlWriter writer = XmlWriter.Create(builder))
        {
            feedItem.SaveAsRss20(writer);
        }

        crawlerResultItem.RawContent = builder.ToString();

        crawlerResult.FeedItems.Add(crawlerResultItem);
    }

    return crawlerResult;
}