/// <summary>
/// Reads the <c>item</c> elements of the first <c>channel</c> element found in an RSS document and
/// appends one <see cref="FeedResult.FeedItem"/> per accepted item to <paramref name="crawlerResult"/>.
/// </summary>
/// <param name="config">
/// Crawler settings read here: <c>FilterByCategories</c>, <c>SummaryTruncateAt</c>,
/// <c>IncludeRawContent</c> and <c>LoadSocialLinkCounters</c>.
/// </param>
/// <param name="doc">Parsed feed document. Nothing is added when it has no root or no <c>channel</c> element.</param>
/// <param name="crawlerResult">Result object whose <c>FeedItems</c> collection receives the parsed items.</param>
private async Task ParseRssFeed(FeedCrawlerConfig config, XDocument doc, FeedResult crawlerResult)
{
    if (doc.Root == null)
    {
        return;
    }

    // FirstOrDefault rather than First: a document without a <channel> produces no items
    // instead of an InvalidOperationException, mirroring the missing-root guard above.
    var channel = doc.Root.Descendants().FirstOrDefault(i => i.Name.LocalName == "channel");
    if (channel == null)
    {
        return;
    }

    XNamespace slash = "http://purl.org/rss/1.0/modules/slash/";
    bool categoryFilter = config.FilterByCategories.Any();

    foreach (var rssItem in channel.Elements().Where(i => i.Name.LocalName == "item"))
    {
        if (categoryFilter)
        {
            // Names are matched by local name only, so namespaced <category> elements count too.
            bool matchesFilter = rssItem.Elements()
                .Where(i => i.Name.LocalName == "category")
                .Select(c => c.Value.ToCleanString())
                .Any(value => config.FilterByCategories.Contains(value, StringComparer.OrdinalIgnoreCase));

            if (!matchesFilter)
            {
                // A category filter is active and this item carries none of the requested categories.
                continue;
            }
        }

        var crawlerResultItem = new FeedResult.FeedItem
        {
            Title = rssItem.Elements().FirstOrDefault(i => i.Name.LocalName == "title")?.Value.ToCleanString(),
            Href = rssItem.Elements().FirstOrDefault(i => i.Name.LocalName == "link")?.Value.ToCleanString()
        };

        var summary = rssItem.Elements().FirstOrDefault(i => i.Name.LocalName == "description")?.Value.ToCleanString();
        if (summary == null || config.SummaryTruncateAt == 0)
        {
            // No <description>, or truncation disabled: store the value untouched.
            // The null check keeps a missing description away from LoadHtml below.
            crawlerResultItem.Summary = summary;
        }
        else
        {
            // Reduce the markup to its text content before truncating.
            var contentDoc = new HtmlDocument();
            contentDoc.LoadHtml(summary);
            var textContent = contentDoc.DocumentNode.InnerText.Trim();
            crawlerResultItem.Summary = TruncateAtWord(textContent, config.SummaryTruncateAt);
        }

        // PublishedOn is only assigned when the <pubDate> text parses as a date.
        var pubDateValue = rssItem.Elements().FirstOrDefault(i => i.Name.LocalName == "pubDate")?.Value;
        if (DateTime.TryParse(pubDateValue, out DateTime pubDateDateTime))
        {
            crawlerResultItem.PublishedOn = pubDateDateTime;
        }

        if (config.IncludeRawContent)
        {
            crawlerResultItem.RawContent = rssItem.ToString();
        }

        // The comment count is read from the <comments> element of the slash module namespace.
        var commentValue = rssItem.Element(slash + "comments")?.Value;
        if (int.TryParse(commentValue, out int commentInt))
        {
            crawlerResultItem.CommentsCount = commentInt;
        }

        if (config.LoadSocialLinkCounters)
        {
            crawlerResultItem.FacebookCount = await _facebookLoader.GetAsync(crawlerResultItem.Href);
        }

        crawlerResult.FeedItems.Add(crawlerResultItem);
    }
}
/// <summary>
/// Reads the <c>entry</c> elements directly below the root of an Atom document and appends one
/// <see cref="FeedResult.FeedItem"/> per accepted entry to <paramref name="crawlerResult"/>.
/// </summary>
/// <param name="config">
/// Crawler settings read here: <c>FilterByCategories</c>, <c>SummaryTruncateAt</c>,
/// <c>IncludeRawContent</c> and <c>LoadSocialLinkCounters</c>.
/// </param>
/// <param name="doc">Parsed feed document. Nothing is added when it has no root element.</param>
/// <param name="crawlerResult">Result object whose <c>FeedItems</c> collection receives the parsed entries.</param>
private async Task ParseAtomFeed(FeedCrawlerConfig config, XDocument doc, FeedResult crawlerResult)
{
    if (doc.Root == null)
    {
        return;
    }

    XNamespace media = "http://search.yahoo.com/mrss/";
    bool categoryFilter = config.FilterByCategories.Any();

    foreach (var atomItem in doc.Root.Elements().Where(i => i.Name.LocalName == "entry"))
    {
        if (categoryFilter)
        {
            // Categories are compared by their "term" attribute; <category> elements without one are ignored.
            bool matchesFilter = atomItem.Elements()
                .Where(i => i.Name.LocalName == "category")
                .Select(c => c.Attribute("term"))
                .Where(term => term != null)
                .Any(term => config.FilterByCategories.Contains(term.Value.ToCleanString(), StringComparer.OrdinalIgnoreCase));

            if (!matchesFilter)
            {
                // A category filter is active and this entry carries none of the requested categories.
                continue;
            }
        }

        var crawlerResultItem = new FeedResult.FeedItem
        {
            Title = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "title")?.Value,
            Href = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "link")?.Attribute("href")?.Value
        };

        // Summary stays unset when the entry has no <content> element.
        var summary = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "content")?.Value;
        if (summary != null)
        {
            if (config.SummaryTruncateAt == 0)
            {
                crawlerResultItem.Summary = summary;
            }
            else
            {
                // Reduce the markup to its text content before truncating.
                var contentDoc = new HtmlDocument();
                contentDoc.LoadHtml(summary);
                var textContent = contentDoc.DocumentNode.InnerText.Trim();
                crawlerResultItem.Summary = TruncateAtWord(textContent, config.SummaryTruncateAt);
            }
        }

        // Prefer <published>; fall back to <updated> when it is missing or does not parse.
        var pubDateValue = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "published")?.Value;
        if (DateTime.TryParse(pubDateValue, out DateTime pubDateDateTime))
        {
            crawlerResultItem.PublishedOn = pubDateDateTime;
        }
        else
        {
            var updateDateValue = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "updated")?.Value;
            if (DateTime.TryParse(updateDateValue, out DateTime updateDateDateTime))
            {
                crawlerResultItem.PublishedOn = updateDateDateTime;
            }
        }

        var mediaGroupElement = atomItem.Element(media + "group");
        if (mediaGroupElement != null)
        {
            var thumbnailElement = mediaGroupElement.Elements().FirstOrDefault(i => i.Name.LocalName == "thumbnail");
            if (thumbnailElement != null)
            {
                // Attribute() returns null when "url" is absent; ?. avoids a NullReferenceException
                // on a thumbnail element that has no url attribute.
                crawlerResultItem.Thumbnail = thumbnailElement.Attribute("url")?.Value;
            }
        }

        if (config.IncludeRawContent)
        {
            crawlerResultItem.RawContent = atomItem.ToString();
        }

        if (config.LoadSocialLinkCounters)
        {
            crawlerResultItem.FacebookCount = await _facebookLoader.GetAsync(crawlerResultItem.Href);
        }

        crawlerResult.FeedItems.Add(crawlerResultItem);
    }
}