Пример #1
0
        /// <summary>
        /// Loads the page at <paramref name="baseUrl"/>, walks its headers of the given size and,
        /// for every header accepted by the validator, downloads the linked article, extracts its
        /// body text and forwards it to the analysis pipeline when at least one keyword is found.
        /// Afterwards the method calls itself for the next header size, up to and including size 4.
        /// </summary>
        /// <param name="baseUrl">Address of the page whose headers are scanned.</param>
        /// <param name="allArticles">
        /// Collection handed through to <c>CreateArticle</c>; this method does not read or modify it directly.
        /// </param>
        /// <param name="headerSize">
        /// Header size to scan on this pass (default 1). The value given is always scanned once,
        /// even if it is already greater than 4.
        /// </param>
        /// <remarks>
        /// A failure while handling a single header is logged and does not stop the remaining
        /// headers from being processed. Exceptions from loading the page or listing its headers
        /// are not caught here.
        /// </remarks>
        private void SearchArticles(string baseUrl, List<Article> allArticles, int headerSize = 1)
        {
            // Largest header size that is still scanned.
            const int maxHeaderSize = 4;

            var htmlDoc = _spider.LoadPage(baseUrl);
            var headers = _spider.GetHeadersOfSize(htmlDoc, headerSize);

            foreach (var header in headers)
            {
                try
                {
                    // Strings are immutable: Trim() returns a new string, so the result must be kept.
                    var headerText = header.InnerText.Trim();
                    if (!_validator.ConsideredArticleHeader(headerText))
                    {
                        continue;
                    }

                    var (articleUrl, articleHtmlDocument) = _spider.DownloadArticleByHeader(baseUrl, header);

                    // Both parts are needed below; skip when either one is missing.
                    if (articleUrl == null || articleHtmlDocument == null)
                    {
                        continue;
                    }

                    var articleBodyText = _extractor.ExtractBodyTextFromArticleDocument(articleHtmlDocument);
                    var article = CreateArticle(articleHtmlDocument, articleBodyText, allArticles, baseUrl, articleUrl, headerText);
                    var foundKeywords = ScanArticleForKeyWords(article);

                    // Only articles that matched at least one keyword are sent for analysis.
                    if (foundKeywords.Any())
                    {
                        article.Keywords = foundKeywords;
                        _pipeline.SendForAnalysis(article);
                    }
                }
                catch (Exception e)
                {
                    // Best effort: one broken article must not abort the scan of the remaining headers.
                    _logger.LogError($"Failed to download and process article {header.InnerText} with the following exception {e.Message} stack trace: {e.StackTrace}");
                }
            }

            // Continue with the next header size until the limit is reached.
            headerSize++;
            if (headerSize <= maxHeaderSize)
            {
                SearchArticles(baseUrl, allArticles, headerSize);
            }
        }