Ejemplo n.º 1
0
        /// <summary>
        /// Collects article URLs from every entry in <c>newsCatalogues</c>, then downloads and
        /// stores each article that is reachable, not yet present in <c>newsDatabase</c>, and
        /// passes the word-count and header checks. After each stored article the derived
        /// database data (word weights, tags, similar news) is refreshed.
        /// </summary>
        private async Task Update()
        {
            var crawler = new NewsCrawler();

            // ToArray() invokes CrawlNewsListAsync for every catalogue before any result is
            // awaited; the results are then awaited one by one in catalogue order.
            var pendingLists  = newsCatalogues.Select(crawler.CrawlNewsListAsync).ToArray();
            var collectedUrls = new List <string>();

            foreach (var pending in pendingLists)
            {
                collectedUrls.AddRange(await pending);
            }

            foreach (var url in collectedUrls)
            {
                try
                {
                    if (!await HttpUtilites.IsUrlExists(url))
                    {
                        logger.Info($"News page {url} - 404 not found.");
                        continue;
                    }

                    // Already stored: nothing to do for this URL.
                    if (await newsDatabase.IsNewsExistsByUrl(url))
                    {
                        continue;
                    }

                    var article = await crawler.DownloadSingleNewsAsync(url);

                    if (article == null)
                    {
                        continue;
                    }

                    // Article must contain more than MIN_NEWS_TEXT_WORDS_QUANTITY entries after
                    // SimplifyText / SplitDefault / MakeUnique (presumably unique words - confirm).
                    var uniqueCount = SimplifyText(article.Text).SplitDefault().MakeUnique().Count();

                    if (uniqueCount <= MIN_NEWS_TEXT_WORDS_QUANTITY)
                    {
                        continue;
                    }

                    var latestNews = await newsDatabase.GetLatestNews();

                    // The article is stored only when the header check against the latest news returns true.
                    if (!CheckHeaderWordsSimilarity(article.Header, latestNews))
                    {
                        continue;
                    }

                    logger.Info($"Adding news article: {article.Url}");

                    await newsDatabase.AddNews(article);

                    // Refresh derived data after every insert, in this fixed order.
                    await newsDatabase.UpdateNewsArticlesWordsWeights();
                    await newsDatabase.UpdateWordsWeightsDictionary();
                    await newsDatabase.FillNewsTags();
                    await newsDatabase.FillSimilarNews();
                }
                catch (OperationCanceledException)
                {
                    // Treated as a timeout for this URL only; processing continues with the next one.
                    logger.Info($"News page {url} - timed out");
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Downloads a single news page and extracts its header, date, text and first picture.
        /// </summary>
        /// <param name="articleUrl">URL of the article page to download.</param>
        /// <returns>
        /// The parsed <see cref="NewsArticle"/>, or <c>null</c> when the response status is not
        /// HTTP 200, a required page element is missing, the extracted text or header is empty,
        /// or any exception is thrown while downloading or parsing (the exception is logged,
        /// never rethrown).
        /// </returns>
        public async Task <NewsArticle> DownloadSingleNewsAsync(string articleUrl)
        {
            logger.Info($"Downloading article {articleUrl}");

            try
            {
                HttpRequestResult result = await HttpRequest.
                                           HttpGetRequest(articleUrl,
                                                          "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                                                          allowRedirect : true);

                logger.Info($"Have downloaded article {articleUrl}");

                if (result.HttpStatusCode != HttpStatusCode.OK)
                {
                    logger.Warn($"Incorrect HTTP code appeared while downloading article: {result.HttpStatusCode}, {articleUrl}");
                    return null;
                }

                var parser   = new HtmlParser();
                var document = await parser.ParseAsync(result.Data);

                var articleHeader = document.QuerySelector("div.primary_content h1");
                var articleDate   = document.QuerySelector("div.primary_content div.b-text-content div.news_date");
                var articleHtml   = document.QuerySelector("div.primary_content div article");

                if (articleHeader == null || articleDate == null || articleHtml == null)
                {
                    logger.Warn($"News article doesn't contain neccessary item, {articleUrl}");
                    return null;
                }

                var text = new List <string>();

                foreach (var p in articleHtml.TextContent.SplitNoEmptyTrim(new char[] { '\n', '\r' }))
                {
                    var str = p.TrimAndCompactWhitespaces();

                    // Lower-case the paragraph once instead of once per list element in each lambda.
                    var lowered = str.ToLowerInvariant();

                    // A paragraph equal to one of ExceedFragments ends text collection entirely.
                    if (ExceedFragments.Any(_ => lowered == _.ToLowerInvariant()))
                    {
                        break;
                    }

                    // A paragraph equal to one of StopWords is skipped; collection continues.
                    if (StopWords.Any(_ => lowered == _.ToLowerInvariant()))
                    {
                        continue;
                    }

                    if (str.Length > 0)
                    {
                        text.Add(str);
                    }
                }

                if (text.Count == 0)
                {
                    logger.Warn($"News article doesn't have content, {articleUrl}");
                    return null;
                }

                var headerText = articleHeader.TextContent.TrimAndCompactWhitespaces();

                if (String.IsNullOrWhiteSpace(headerText))
                {
                    logger.Warn($"News article doesn't have header, {articleUrl}");
                    return null;
                }

                // The picture is resolved only after the article has passed validation, so the
                // extra IsUrlExists request is not spent on articles that are discarded anyway.
                var pic = articleHtml.QuerySelector("img")?.GetAttribute("src");

                if (pic != null)
                {
                    // Decode then re-escape the src value and combine it with the article URL
                    // (CompoundUrl presumably yields an absolute URL - confirm against its definition).
                    pic = WebUtility.UrlDecode(pic);
                    pic = CompoundUrl(new Uri(articleUrl), Uri.EscapeUriString(pic));

                    // Drop the picture when IsUrlExists reports false.
                    pic = await HttpUtilites.IsUrlExists(pic) ? pic : null;
                }

                DateTime date;

                try
                {
                    date = DateTime.ParseExact(
                        RetrieveDayDate(articleDate.TextContent),
                        "dd.MM.yyyy",
                        CultureInfo.InvariantCulture);
                }
                catch
                {
                    // Deliberate best-effort: any failure to extract or parse the date falls
                    // back to the current local time rather than rejecting the article.
                    date = DateTime.Now;
                }

                return new NewsArticle
                {
                    Header     = headerText,
                    Text       = string.Join("\n", text),
                    Url        = articleUrl,
                    Date       = date,
                    PictureUrl = pic
                };
            }
            catch (Exception e)
            {
                // Any failure while downloading or parsing is logged and reported to the caller as null.
                logger.Error($"While downloading article {articleUrl}", e);
                return null;
            }
        }