/// <summary>
/// Downloads the RSS feed, takes the newest articles and scrapes each one
/// into an <see cref="Article"/> (title, image, text, summary, keywords).
/// </summary>
/// <returns>The scraped articles, newest first.</returns>
/// <exception cref="InvalidOperationException">Thrown when the feed cannot be downloaded.</exception>
private async Task<List<Article>> ScrapeArticles()
{
    // How many of the most recent feed entries to scrape per run.
    const int MaxArticles = 10;

    var articles = new List<Article>();

    var feed = await httpHelper.Get(FeedUrl);
    if (string.IsNullOrEmpty(feed))
    {
        // A specific exception type instead of bare Exception; callers
        // catching Exception still catch this.
        throw new InvalidOperationException($"Could not get feed from url {FeedUrl}");
    }

    var parser = new RssParser();
    var rss = parser.Parse(feed);

    var newestArticlesSchemas = rss
        .OrderByDescending(x => x.PublishDate)
        .Take(MaxArticles);

    foreach (var schema in newestArticlesSchemas)
    {
        // schema.InternalID is used as the article URL (see SourceUrl below).
        var document = await httpHelper.GetDocumentFromUrl(schema.InternalID);

        articles.Add(new Article
        {
            Source = NewsSource.PriznajemHr,
            Guid = GetGuidFromUrl(schema.InternalID),
            Title = schema.Title,
            Image = GetArticleImage(document),
            // NOTE(review): these pass schema.Content, while the sibling
            // GetArticleText(string url) on this class expects a URL —
            // verify the httpHelper overloads are the intended ones.
            Text = await httpHelper.GetArticleText(schema.Content),
            Summary = await httpHelper.GetFirstParagraph(schema.Content),
            Keywords = keywordHelper.GetKeywordsFromTitle(schema.Title),
            SourceUrl = schema.InternalID
        });
    }

    return articles;
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and extracts the article body:
/// the text of all leaf paragraphs directly under <c>div.text</c>, each
/// followed by a blank line.
/// </summary>
/// <param name="url">Absolute URL of the article page.</param>
/// <returns>The concatenated paragraph text, or an empty string when no paragraphs match.</returns>
private async Task<string> GetArticleText(string url)
{
    var document = await httpHelper.GetDocumentFromUrl(url);

    // Only leaf <p> elements (no child elements) — skips paragraphs that
    // wrap images, embeds, etc.
    var paragraphs = document
        .QuerySelectorAll("div.text > p")
        .Where(x => x.ChildElementCount == 0);

    var paragraphStringBuilder = new StringBuilder();
    foreach (var par in paragraphs)
    {
        // AppendLine twice instead of concatenating with Environment.NewLine:
        // same output, but avoids allocating an intermediate string per paragraph.
        paragraphStringBuilder
            .AppendLine(par.TextContent)
            .AppendLine();
    }

    return paragraphStringBuilder.ToString();
}