Esempio n. 1
0
        /// <summary>
        /// Reads the <c>item</c> elements of the first <c>channel</c> element in an RSS document and appends
        /// one <see cref="FeedResult.FeedItem"/> per accepted item to <paramref name="crawlerResult"/>.
        /// Elements are matched by local name only, so namespace prefixes on the feed elements are ignored
        /// (except for the comment counter, which must be in the "slash" module namespace).
        /// </summary>
        /// <param name="config">
        /// Crawler settings read here: <c>FilterByCategories</c>, <c>SummaryTruncateAt</c>,
        /// <c>IncludeRawContent</c> and <c>LoadSocialLinkCounters</c>.
        /// </param>
        /// <param name="doc">Parsed feed document. Nothing is added when it has no root or no <c>channel</c> element.</param>
        /// <param name="crawlerResult">Result object whose <c>FeedItems</c> collection receives the parsed items.</param>
        private async Task ParseRssFeed(FeedCrawlerConfig config, XDocument doc, FeedResult crawlerResult)
        {
            if (doc.Root == null)
            {
                return;
            }

            // FirstOrDefault instead of First: a document without a <channel> is treated as an empty feed
            // rather than raising InvalidOperationException.
            var channel = doc.Root.Descendants().FirstOrDefault(i => i.Name.LocalName == "channel");
            if (channel == null)
            {
                return;
            }

            var rssItems = channel.Elements().Where(i => i.Name.LocalName == "item");

            bool categoryFilter = config.FilterByCategories.Any();

            foreach (var rssItem in rssItems)
            {
                var crawlerResultItem = new FeedResult.FeedItem
                {
                    Title = rssItem.Elements().FirstOrDefault(i => i.Name.LocalName == "title")?.Value.ToCleanString(),
                    Href  = rssItem.Elements().FirstOrDefault(i => i.Name.LocalName == "link")?.Value.ToCleanString()
                };

                if (categoryFilter)
                {
                    // An item passes when at least one of its <category> values is in the configured list (case-insensitive).
                    bool doesMatchCategoryFilter = rssItem.Elements()
                        .Where(i => i.Name.LocalName == "category")
                        .Any(i => config.FilterByCategories.Contains(i.Value.ToCleanString(), StringComparer.OrdinalIgnoreCase));

                    if (!doesMatchCategoryFilter)
                    {
                        // skip this item: a category filter is configured, but none of the item's categories matched
                        continue;
                    }
                }

                var summary = rssItem.Elements().FirstOrDefault(i => i.Name.LocalName == "description")?.Value.ToCleanString();
                if (summary == null || config.SummaryTruncateAt == 0)
                {
                    // No description, or truncation disabled: store the value as-is.
                    // The null check keeps a missing <description> from reaching LoadHtml below.
                    crawlerResultItem.Summary = summary;
                }
                else
                {
                    // Reduce the markup to its text content before truncating.
                    var contentDoc = new HtmlDocument();
                    contentDoc.LoadHtml(summary);
                    var textContent = contentDoc.DocumentNode.InnerText.Trim();
                    crawlerResultItem.Summary = TruncateAtWord(textContent, config.SummaryTruncateAt);
                }

                // Feed dates are machine-readable, so parse them culture-independently.
                var pubDateValue = rssItem.Elements().FirstOrDefault(i => i.Name.LocalName == "pubDate")?.Value;
                if (DateTime.TryParse(pubDateValue,
                                      System.Globalization.CultureInfo.InvariantCulture,
                                      System.Globalization.DateTimeStyles.None,
                                      out DateTime pubDateDateTime))
                {
                    crawlerResultItem.PublishedOn = pubDateDateTime;
                }

                if (config.IncludeRawContent)
                {
                    crawlerResultItem.RawContent = rssItem.ToString();
                }

                // Comment counter from the RSS "slash" module namespace.
                var commentValue = rssItem.Elements().FirstOrDefault(i =>
                                                                     i.Name.LocalName == "comments" && i.Name.NamespaceName == "http://purl.org/rss/1.0/modules/slash/")?.Value;
                if (int.TryParse(commentValue,
                                 System.Globalization.NumberStyles.Integer,
                                 System.Globalization.CultureInfo.InvariantCulture,
                                 out int commentInt))
                {
                    crawlerResultItem.CommentsCount = commentInt;
                }

                if (config.LoadSocialLinkCounters)
                {
                    crawlerResultItem.FacebookCount = await _facebookLoader.GetAsync(crawlerResultItem.Href);
                }

                crawlerResult.FeedItems.Add(crawlerResultItem);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Reads the <c>entry</c> elements directly below the root of an Atom document and appends one
        /// <see cref="FeedResult.FeedItem"/> per accepted entry to <paramref name="crawlerResult"/>.
        /// Elements are matched by local name only, except the media group, which must be in the
        /// <c>http://search.yahoo.com/mrss/</c> namespace.
        /// </summary>
        /// <param name="config">
        /// Crawler settings read here: <c>FilterByCategories</c>, <c>SummaryTruncateAt</c>,
        /// <c>IncludeRawContent</c> and <c>LoadSocialLinkCounters</c>.
        /// </param>
        /// <param name="doc">Parsed feed document. Nothing is added when it has no root element.</param>
        /// <param name="crawlerResult">Result object whose <c>FeedItems</c> collection receives the parsed entries.</param>
        private async Task ParseAtomFeed(FeedCrawlerConfig config, XDocument doc, FeedResult crawlerResult)
        {
            if (doc.Root == null)
            {
                return;
            }

            var atomItems = doc.Root.Elements()
                            .Where(i => i.Name.LocalName == "entry");

            bool categoryFilter = config.FilterByCategories.Any();

            // Loop-invariant namespace used to locate the media group of each entry.
            XNamespace media = "http://search.yahoo.com/mrss/";

            foreach (var atomItem in atomItems)
            {
                var crawlerResultItem = new FeedResult.FeedItem
                {
                    Title = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "title")?.Value,
                    Href  = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "link")?.Attribute("href")?.Value
                };

                if (categoryFilter)
                {
                    // An entry passes when at least one <category term="..."> value is in the configured list
                    // (case-insensitive). Categories without a "term" attribute are ignored.
                    bool doesMatchCategoryFilter = atomItem.Elements()
                        .Where(i => i.Name.LocalName == "category")
                        .Select(i => i.Attribute("term"))
                        .Where(term => term != null)
                        .Any(term => config.FilterByCategories.Contains(term.Value.ToCleanString(), StringComparer.OrdinalIgnoreCase));

                    if (!doesMatchCategoryFilter)
                    {
                        // skip this entry: a category filter is configured, but none of the entry's categories matched
                        continue;
                    }
                }

                var summary = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "content")?.Value;

                if (summary != null)
                {
                    if (config.SummaryTruncateAt == 0)
                    {
                        crawlerResultItem.Summary = summary;
                    }
                    else
                    {
                        // Reduce the markup to its text content before truncating.
                        var contentDoc = new HtmlDocument();
                        contentDoc.LoadHtml(summary);
                        var textContent = contentDoc.DocumentNode.InnerText.Trim();
                        crawlerResultItem.Summary = TruncateAtWord(textContent, config.SummaryTruncateAt);
                    }
                }

                // Prefer <published>; fall back to <updated> when it is missing or unparsable.
                // Feed dates are machine-readable, so parse them culture-independently.
                var pubDateValue = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "published")?.Value;
                if (DateTime.TryParse(pubDateValue,
                                      System.Globalization.CultureInfo.InvariantCulture,
                                      System.Globalization.DateTimeStyles.None,
                                      out DateTime pubDateDateTime))
                {
                    crawlerResultItem.PublishedOn = pubDateDateTime;
                }
                else
                {
                    var updateDateValue = atomItem.Elements().FirstOrDefault(i => i.Name.LocalName == "updated")?.Value;
                    if (DateTime.TryParse(updateDateValue,
                                          System.Globalization.CultureInfo.InvariantCulture,
                                          System.Globalization.DateTimeStyles.None,
                                          out DateTime updateDateDateTime))
                    {
                        crawlerResultItem.PublishedOn = updateDateDateTime;
                    }
                }

                // Thumbnail is only assigned when the group, the thumbnail element and its "url" attribute all exist;
                // a thumbnail without "url" is skipped instead of raising NullReferenceException.
                var thumbnailUrl = atomItem.Element(media + "group")
                                           ?.Elements()
                                           .FirstOrDefault(i => i.Name.LocalName == "thumbnail")
                                           ?.Attribute("url")
                                           ?.Value;
                if (thumbnailUrl != null)
                {
                    crawlerResultItem.Thumbnail = thumbnailUrl;
                }

                if (config.IncludeRawContent)
                {
                    crawlerResultItem.RawContent = atomItem.ToString();
                }

                if (config.LoadSocialLinkCounters)
                {
                    crawlerResultItem.FacebookCount = await _facebookLoader.GetAsync(crawlerResultItem.Href);
                }

                crawlerResult.FeedItems.Add(crawlerResultItem);
            }
        }