/// <summary>
/// Scrapes the newest articles from the Index.hr RSS feed, keeping only
/// entries in the "Vijesti" (news) category.
/// </summary>
/// <returns>Articles built from the 10 most recent matching feed entries.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the feed cannot be retrieved from <c>FeedUrl</c>.
/// </exception>
private async Task<List<Article>> ScrapeArticles()
{
    var articles = new List<Article>();

    var feed = await httpHelper.Get(FeedUrl);
    if (string.IsNullOrEmpty(feed))
    {
        // Use a specific exception type: NullReferenceException is reserved
        // for the runtime and must never be thrown explicitly (CA2201).
        throw new InvalidOperationException($"Could not get feed from url {FeedUrl}");
    }

    var parser = new RssParser();
    var rss = parser.Parse(feed);

    // Bound the amount of per-article scraping work to the 10 newest entries.
    var newestArticlesSchemas = rss.OrderByDescending(x => x.PublishDate).Take(10);

    foreach (var schema in newestArticlesSchemas)
    {
        // Skip anything outside the "Vijesti" (news) category.
        if (!schema.Categories.Contains("Vijesti"))
        {
            continue;
        }

        articles.Add(new Article
        {
            Source = NewsSource.IndexHr,
            Guid = GetGuidFromUrl(schema.InternalID),
            Title = schema.Title,
            Image = schema.ImageUrl,
            Text = await GetArticleText(schema.InternalID),
            Summary = schema.Summary,
            Keywords = keywordHelper.GetKeywordsFromTitle(schema.Title),
            SourceUrl = schema.InternalID
        });
    }

    return articles;
}
/// <summary>
/// Scrapes the newest articles from the Priznajem.hr RSS feed, fetching
/// each article's page to extract its image, text and summary.
/// </summary>
/// <returns>Articles built from the 10 most recent feed entries.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the feed cannot be retrieved from <c>FeedUrl</c>.
/// </exception>
private async Task<List<Article>> ScrapeArticles()
{
    var articles = new List<Article>();

    var feed = await httpHelper.Get(FeedUrl);
    if (string.IsNullOrEmpty(feed))
    {
        // Throw a specific exception type rather than the base Exception (CA2201);
        // callers can then catch a meaningful failure category.
        throw new InvalidOperationException($"Could not get feed from url {FeedUrl}");
    }

    var parser = new RssParser();
    var rss = parser.Parse(feed);

    // Bound the amount of per-article scraping work to the 10 newest entries.
    var newestArticlesSchemas = rss.OrderByDescending(x => x.PublishDate).Take(10);

    foreach (var schema in newestArticlesSchemas)
    {
        // The article image is only available on the article page itself,
        // so fetch the full document for each entry.
        var document = await httpHelper.GetDocumentFromUrl(schema.InternalID);

        articles.Add(new Article
        {
            Source = NewsSource.PriznajemHr,
            Guid = GetGuidFromUrl(schema.InternalID),
            Title = schema.Title,
            Image = GetArticleImage(document),
            Text = await httpHelper.GetArticleText(schema.Content),
            Summary = await httpHelper.GetFirstParagraph(schema.Content),
            Keywords = keywordHelper.GetKeywordsFromTitle(schema.Title),
            SourceUrl = schema.InternalID
        });
    }

    return articles;
}
public void GetKeywordsFromTitle_ReturnsCorrectKeywords()
{
    // Arrange: a messy title mixing punctuation, digits, stop words,
    // diacritics and the (FOTO)/(VIDEO) markers.
    var keywordHelper = new KeywordHelper();
    const string messyTitle = "This,; is: a## test title! čžćđ## 123as sdf-;:\"'+? (FOTO) (VIDEO)";

    // Act
    var result = keywordHelper.GetKeywordsFromTitle(messyTitle);

    // Assert: only the meaningful words survive, upper-cased and comma-joined.
    Assert.Equal("THIS,TEST,TITLE,ČŽĆĐ", result);
}