/// <summary>
/// Crawls every news catalogue, then walks the collected article URLs and stores
/// each article that passes all checks, refreshing the derived database data
/// after every insert.
/// </summary>
/// <remarks>
/// An <see cref="OperationCanceledException"/> raised while handling a single URL is
/// logged as a timeout and processing moves on to the next URL. Any other exception
/// propagates to the caller.
/// </remarks>
private async Task Update()
{
    var collectedUrls = new List<string>();
    var crawler = new NewsCrawler();

    // ToArray() forces the Select, so CrawlNewsListAsync is invoked for every
    // catalogue up front; the results are then awaited one by one in catalogue order.
    var pendingCatalogues = newsCatalogues.Select(crawler.CrawlNewsListAsync).ToArray();
    foreach (var pending in pendingCatalogues)
    {
        collectedUrls.AddRange(await pending);
    }

    foreach (var pageUrl in collectedUrls)
    {
        try
        {
            var pageExists = await HttpUtilites.IsUrlExists(pageUrl);
            if (!pageExists)
            {
                logger.Info($"News page {pageUrl} - 404 not found.");
                continue;
            }

            // Skip pages that are already stored.
            var alreadyStored = await newsDatabase.IsNewsExistsByUrl(pageUrl);
            if (alreadyStored)
            {
                continue;
            }

            var article = await crawler.DownloadSingleNewsAsync(pageUrl);
            if (article == null)
            {
                continue;
            }

            // Require more unique words than MIN_NEWS_TEXT_WORDS_QUANTITY.
            var hasEnoughWords =
                SimplifyText(article.Text).SplitDefault().MakeUnique().Count() > MIN_NEWS_TEXT_WORDS_QUANTITY;
            if (!hasEnoughWords)
            {
                continue;
            }

            // Re-read the latest news for every candidate: an earlier iteration
            // may have just added an article.
            var recentNews = await newsDatabase.GetLatestNews();
            if (!CheckHeaderWordsSimilarity(article.Header, recentNews))
            {
                continue;
            }

            logger.Info($"Adding news article: {article.Url}");
            await newsDatabase.AddNews(article);

            // Refresh derived data right after the insert, in this fixed order.
            await newsDatabase.UpdateNewsArticlesWordsWeights();
            await newsDatabase.UpdateWordsWeightsDictionary();
            await newsDatabase.FillNewsTags();
            await newsDatabase.FillSimilarNews();
        }
        catch (OperationCanceledException)
        {
            logger.Info($"News page {pageUrl} - timed out");
        }
    }
}
/// <summary>
/// Downloads a single news page and converts it into a <see cref="NewsArticle"/>.
/// </summary>
/// <param name="articleUrl">URL of the article page to download.</param>
/// <returns>
/// The parsed article, or <c>null</c> when the response status is not 200 OK, a
/// required page element (header, date, article body) is missing, the extracted text
/// or header is empty, or an exception is thrown while downloading or parsing
/// (the exception is logged, not propagated).
/// </returns>
public async Task<NewsArticle> DownloadSingleNewsAsync(string articleUrl)
{
    logger.Info($"Downloading article {articleUrl}");
    try
    {
        HttpRequestResult result = await HttpRequest.HttpGetRequest(
            articleUrl,
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
            allowRedirect: true);
        logger.Info($"Have downloaded article {articleUrl}");

        if (result.HttpStatusCode != HttpStatusCode.OK)
        {
            logger.Warn($"Incorrect HTTP code appeared while downloading article: {result.HttpStatusCode}, {articleUrl}");
            return null;
        }

        var parser = new HtmlParser();
        var document = await parser.ParseAsync(result.Data);

        var articleHeader = document.QuerySelector("div.primary_content h1");
        var articleDate = document.QuerySelector("div.primary_content div.b-text-content div.news_date");
        var articleHtml = document.QuerySelector("div.primary_content div article");
        if (articleHeader == null || articleDate == null || articleHtml == null)
        {
            logger.Warn($"News article doesn't contain neccessary item, {articleUrl}");
            return null;
        }

        // Validate text and header BEFORE touching the picture: the picture check
        // costs an extra HTTP request that is pointless for a rejected article.
        var paragraphs = ExtractArticleParagraphs(articleHtml.TextContent);
        if (paragraphs.Count == 0)
        {
            logger.Warn($"News article doesn't have content, {articleUrl}");
            return null;
        }

        var headerText = articleHeader.TextContent.TrimAndCompactWhitespaces();
        if (string.IsNullOrWhiteSpace(headerText))
        {
            logger.Warn($"News article doesn't have header, {articleUrl}");
            return null;
        }

        var pictureUrl = await ResolveArticlePictureUrlAsync(
            articleHtml.QuerySelector("img")?.GetAttribute("src"),
            articleUrl);

        return new NewsArticle
        {
            Header = headerText,
            Text = string.Join("\n", paragraphs),
            Url = articleUrl,
            Date = ParseArticleDateOrNow(articleDate.TextContent, articleUrl),
            PictureUrl = pictureUrl
        };
    }
    catch (Exception e)
    {
        logger.Error($"While downloading article {articleUrl}", e);
        return null;
    }
}

// Splits raw article text into cleaned, non-empty paragraphs. Stops at the first
// line equal (ignoring case) to an ExceedFragments entry and skips lines equal
// (ignoring case) to a StopWords entry.
private List<string> ExtractArticleParagraphs(string rawText)
{
    var paragraphs = new List<string>();
    foreach (var line in rawText.SplitNoEmptyTrim(new char[] { '\n', '\r' }))
    {
        var paragraph = line.TrimAndCompactWhitespaces();

        // OrdinalIgnoreCase avoids allocating two lowered strings per comparison.
        if (ExceedFragments.Any(fragment => string.Equals(paragraph, fragment, StringComparison.OrdinalIgnoreCase)))
        {
            break;
        }

        if (StopWords.Any(stopWord => string.Equals(paragraph, stopWord, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        if (paragraph.Length > 0)
        {
            paragraphs.Add(paragraph);
        }
    }

    return paragraphs;
}

// Builds the picture URL from the raw img "src" value and the article URL via
// CompoundUrl; returns it only when HttpUtilites.IsUrlExists confirms it, otherwise null.
private async Task<string> ResolveArticlePictureUrlAsync(string rawSrc, string articleUrl)
{
    if (rawSrc == null)
    {
        return null;
    }

    // Decode first, then re-escape, so the value is not escaped twice.
    // NOTE(review): Uri.EscapeUriString is marked obsolete in recent .NET versions;
    // kept here to preserve the existing escaping behavior.
    var decoded = WebUtility.UrlDecode(rawSrc);
    string candidate = CompoundUrl(new Uri(articleUrl), Uri.EscapeUriString(decoded));

    return await HttpUtilites.IsUrlExists(candidate) ? candidate : null;
}

// Parses the article date in "dd.MM.yyyy" form; on any failure falls back to the
// current local time (best effort) and logs a warning so the fabricated date is visible.
private DateTime ParseArticleDateOrNow(string rawDateText, string articleUrl)
{
    try
    {
        DateTime parsed;
        if (DateTime.TryParseExact(
                RetrieveDayDate(rawDateText),
                "dd.MM.yyyy",
                CultureInfo.InvariantCulture,
                DateTimeStyles.None,
                out parsed))
        {
            return parsed;
        }

        logger.Warn($"News article date is not in dd.MM.yyyy format, using current time, {articleUrl}");
    }
    catch (Exception e)
    {
        // RetrieveDayDate may throw on unexpected input; a bad date must not
        // discard an otherwise valid article.
        logger.Warn($"Failed to extract news article date ({e.Message}), using current time, {articleUrl}");
    }

    return DateTime.Now;
}