Beispiel #1
0
        private async Task <List <Article> > ScrapeArticles()
        {
            var articles = new List <Article>();

            var feed = await httpHelper.Get(FeedUrl);

            if (string.IsNullOrEmpty(feed))
            {
                throw new Exception($"Could not get feed from url {FeedUrl}");
            }

            var parser = new RssParser();
            var rss    = parser.Parse(feed);
            var newestArticlesSchemas = rss.OrderByDescending(x => x.PublishDate).Take(10);

            foreach (var schema in newestArticlesSchemas)
            {
                var document = await httpHelper.GetDocumentFromUrl(schema.InternalID);

                articles.Add(new Article
                {
                    Source    = NewsSource.PriznajemHr,
                    Guid      = GetGuidFromUrl(schema.InternalID),
                    Title     = schema.Title,
                    Image     = GetArticleImage(document),
                    Text      = await httpHelper.GetArticleText(schema.Content),
                    Summary   = await httpHelper.GetFirstParagraph(schema.Content),
                    Keywords  = keywordHelper.GetKeywordsFromTitle(schema.Title),
                    SourceUrl = schema.InternalID
                });
            }

            return(articles);
        }
Beispiel #2
0
        private async Task <string> GetArticleText(string url)
        {
            var document = await httpHelper.GetDocumentFromUrl(url);

            var paragraphs = document
                             .QuerySelectorAll("div.text > p")
                             .Where(x => x.ChildElementCount == 0);

            var paragraphStringBuilder = new StringBuilder();

            foreach (var par in paragraphs)
            {
                paragraphStringBuilder.Append(par.TextContent + Environment.NewLine + Environment.NewLine);
            }

            return(paragraphStringBuilder.ToString());
        }