Exemple #1
0
        /// <summary>
        /// Downloads the picture attached to the news article located at <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Address of the news article page.</param>
        /// <returns>
        /// The raw picture bytes, or <c>null</c> when the URL is blank, the article could not be
        /// loaded, the article has no picture URL, the picture download throws, or the downloaded
        /// payload is null, empty or larger than 5 * 1024 * 1024 bytes.
        /// </returns>
        /// <remarks>
        /// Only the picture download is guarded by the try/catch; an exception thrown while
        /// downloading the article itself propagates to the caller.
        /// </remarks>
        private async Task<byte[]> GetNewsImage(string url)
        {
            // Upper bound on the accepted picture size (5 MiB).
            const int maxImageBytes = 5 * 1024 * 1024;

            if (String.IsNullOrWhiteSpace(url))
            {
                return null;
            }

            var article = await newsCrawler.DownloadSingleNewsAsync(url);

            var hasPictureUrl = article != null && !String.IsNullOrWhiteSpace(article.PictureUrl);
            if (!hasPictureUrl)
            {
                return null;
            }

            try
            {
                // NOTE(review): the second argument (300) is presumably a timeout - confirm against HttpUtilites.
                var imageBytes = await HttpUtilites.GetBytesByUrl(article.PictureUrl, 60 * 5);

                var isUsable = imageBytes != null
                               && imageBytes.Length != 0
                               && imageBytes.Length <= maxImageBytes;

                return isUsable ? imageBytes : null;
            }
            catch
            {
                // Best effort: any failure while fetching the picture is reported as "no image".
                return null;
            }
        }
Exemple #2
0
        /// <summary>
        /// Crawls every catalogue in <c>newsCatalogues</c> for article URLs and stores the
        /// articles that pass all checks in <c>newsDatabase</c>.
        /// </summary>
        /// <remarks>
        /// For each collected URL the method, in order: verifies the URL exists, skips URLs the
        /// database already knows, downloads the article, requires more than
        /// <c>MIN_NEWS_TEXT_WORDS_QUANTITY</c> unique words in its simplified text, and requires
        /// <c>CheckHeaderWordsSimilarity</c> to return <c>true</c> for its header against the
        /// latest stored news. Accepted articles are added and the database's word weights,
        /// tags and similar-news data are refreshed immediately afterwards.
        /// An <see cref="OperationCanceledException"/> raised while processing a URL is logged
        /// and the loop moves on; any other exception propagates to the caller.
        /// </remarks>
        private async Task Update()
        {
            var newsCrawler = new NewsCrawler();

            // ToArray() invokes CrawlNewsListAsync for every catalogue up front;
            // the results are then awaited one by one in catalogue order.
            var pendingLists = newsCatalogues.Select(newsCrawler.CrawlNewsListAsync).ToArray();

            var newsUrls = new List<string>();
            foreach (var pendingList in pendingLists)
            {
                newsUrls.AddRange(await pendingList);
            }

            foreach (var url in newsUrls)
            {
                try
                {
                    if (!await HttpUtilites.IsUrlExists(url))
                    {
                        logger.Info($"News page {url} - 404 not found.");
                        continue;
                    }

                    // Already stored - nothing to do for this URL.
                    if (await newsDatabase.IsNewsExistsByUrl(url))
                    {
                        continue;
                    }

                    var newsArticle = await newsCrawler.DownloadSingleNewsAsync(url);
                    if (newsArticle == null)
                    {
                        continue;
                    }

                    var uniqueWordCount = SimplifyText(newsArticle.Text)
                        .SplitDefault()
                        .MakeUnique()
                        .Count();
                    if (uniqueWordCount <= MIN_NEWS_TEXT_WORDS_QUANTITY)
                    {
                        continue;
                    }

                    // Re-read on every iteration: articles added earlier in this run are part of the comparison set.
                    var latestNews = await newsDatabase.GetLatestNews();
                    if (!CheckHeaderWordsSimilarity(newsArticle.Header, latestNews))
                    {
                        continue;
                    }

                    logger.Info($"Adding news article: {newsArticle.Url}");

                    await newsDatabase.AddNews(newsArticle);

                    // Refresh the derived data right after each insert, in the same order as before.
                    await newsDatabase.UpdateNewsArticlesWordsWeights();
                    await newsDatabase.UpdateWordsWeightsDictionary();
                    await newsDatabase.FillNewsTags();
                    await newsDatabase.FillSimilarNews();
                }
                catch (OperationCanceledException)
                {
                    logger.Info($"News page {url} - timed out");
                }
            }
        }