/// <summary>
/// Crawls every catalogue in <c>newsCatalogues</c> for article URLs, then downloads and
/// stores each article that passes all checks below.
/// </summary>
/// <remarks>
/// A URL is skipped when any of the following holds:
/// <list type="bullet">
/// <item><description><c>HttpUtilites.IsUrlExists</c> reports false (logged as 404);</description></item>
/// <item><description><c>newsDatabase.IsNewsExistsByUrl</c> reports true;</description></item>
/// <item><description>the download yields <c>null</c>;</description></item>
/// <item><description>the simplified text has no more than <c>MIN_NEWS_TEXT_WORDS_QUANTITY</c> unique words;</description></item>
/// <item><description><c>CheckHeaderWordsSimilarity</c> returns false for the header against the latest news.</description></item>
/// </list>
/// After each successful insert the words weights, weights dictionary, tags and
/// similar-news data are refreshed. An <see cref="OperationCanceledException"/> raised
/// while handling a URL is logged as a timeout and the loop moves on to the next URL;
/// any other exception propagates to the caller.
/// </remarks>
/// <returns>A task that completes once every collected URL has been handled.</returns>
private async Task Update()
{
    var crawler = new NewsCrawler();

    // ToArray() forces the projection to run immediately, so CrawlNewsListAsync is
    // invoked for every catalogue before any of the results is awaited.
    var pendingCrawls = newsCatalogues.Select(crawler.CrawlNewsListAsync).ToArray();

    var collectedUrls = new List<string>();
    foreach (var pendingCrawl in pendingCrawls)
    {
        collectedUrls.AddRange(await pendingCrawl);
    }

    foreach (var url in collectedUrls)
    {
        try
        {
            var pageIsReachable = await HttpUtilites.IsUrlExists(url);
            if (!pageIsReachable)
            {
                logger.Info($"News page {url} - 404 not found.");
                continue;
            }

            var alreadyStored = await newsDatabase.IsNewsExistsByUrl(url);
            if (alreadyStored)
            {
                continue;
            }

            var newsArticle = await crawler.DownloadSingleNewsAsync(url);
            if (newsArticle == null)
            {
                continue;
            }

            // Articles whose text has too few distinct words are ignored.
            var uniqueWordCount = SimplifyText(newsArticle.Text).SplitDefault().MakeUnique().Count();
            if (uniqueWordCount <= MIN_NEWS_TEXT_WORDS_QUANTITY)
            {
                continue;
            }

            // NOTE(review): presumably true means the header is not a near-duplicate of
            // recent news - confirm against CheckHeaderWordsSimilarity.
            var recentNews = await newsDatabase.GetLatestNews();
            if (!CheckHeaderWordsSimilarity(newsArticle.Header, recentNews))
            {
                continue;
            }

            logger.Info($"Adding news article: {newsArticle.Url}");
            await newsDatabase.AddNews(newsArticle);

            // Refresh the derived data after the insert, in this fixed order.
            await newsDatabase.UpdateNewsArticlesWordsWeights();
            await newsDatabase.UpdateWordsWeightsDictionary();
            await newsDatabase.FillNewsTags();
            await newsDatabase.FillSimilarNews();
        }
        catch (OperationCanceledException)
        {
            logger.Info($"News page {url} - timed out");
        }
    }
}
/// <summary>
/// Initializes a new <c>NewsTweeter</c>: takes the shared <c>NewsDatabase.Instance</c>,
/// and creates a fresh <see cref="Random"/> and a fresh <c>NewsCrawler</c>.
/// </summary>
/// <remarks>
/// The constructor is private, so instances can only be created from inside this type.
/// </remarks>
private NewsTweeter()
{
    newsDatabase = NewsDatabase.Instance;
    random = new Random();
    newsCrawler = new NewsCrawler();
}