/// <summary>
/// Follows every article link found in <paramref name="document"/>, downloads and parses each
/// linked page into an <see cref="Article"/>, and tags it as new or already known.
/// </summary>
/// <param name="document">Document whose <c>article[id] .headline-medium a</c> anchors are treated as article links.</param>
/// <param name="requestDelay">Passed through unchanged to <c>GetDocumentFromUrl</c> for every article request.</param>
/// <returns>
/// One result per article parsed successfully. Articles whose relative URL was not yet in
/// <c>_existingArticles</c> are registered there and marked <c>PendingForSave</c>; the others are
/// marked <c>AlreadyExists</c>. If any step throws, the exception is logged and the results
/// gathered up to that point are returned.
/// </returns>
private List<ParsingResult<Article>> GetArticlesFromDocument(IHtmlDocument document, int requestDelay)
        {
            var resultArticles = new List<ParsingResult<Article>>();

            try
            {
                foreach (string url in GetArticleUrlsFromDocument())
                {
                    IHtmlDocument articleDocument = GetDocumentFromUrl(url, requestDelay);
                    Article article = GetArticleFromDocument(url, articleDocument);

                    // An article counts as new only the first time its relative URL is seen.
                    ParsingResultType type = ParsingResultType.AlreadyExists;
                    if (!_existingArticles.Contains(article.RelativeUrl))
                    {
                        _existingArticles.Add(article.RelativeUrl);
                        type = ParsingResultType.PendingForSave;
                    }

                    resultArticles.Add(new ParsingResult<Article>(article, type));
                }
            }
            catch (Exception ex)
            {
                // The first failure stops processing of the remaining links; what was parsed so far is kept.
                // NOTE(review): if _logger is a Microsoft.Extensions.Logging ILogger, this overload treats
                // `ex` as a message-format argument and the exception details are not logged; the
                // exception-first overload would be needed instead — confirm the logger type.
                _logger.LogError("Unexpected exception when parsing articles", ex);
            }

            return resultArticles;

            // Yields the absolute URL of every article anchor in the document.
            IEnumerable<string> GetArticleUrlsFromDocument()
            {
                foreach (IElement anchor in document.QuerySelectorAll("article[id] .headline-medium a"))
                {
                    var href = anchor.GetAttribute("href");

                    // An anchor without a usable href would otherwise yield the bare RootPath,
                    // which would then be fetched and parsed as if it were an article.
                    if (string.IsNullOrWhiteSpace(href))
                    {
                        continue;
                    }

                    yield return $"{RootPath}{href}";
                }
            }

            // Extracts the article fields from a downloaded article page.
            // Throws InvalidOperationException when the page lacks one of the expected elements.
            Article GetArticleFromDocument(string documentUrl, IHtmlDocument articleDocument)
            {
                var analysisElement = RequireElement(".ARTIKEL>article>.analysis");
                var headlineElement = RequireElement(".ARTIKEL>article>.headline-large");
                var timeElement     = RequireElement(".ARTIKEL>article>cite>time");
                var authorElement   = RequireElement(".ARTIKEL>article>cite>span");
                var bodyElement     = RequireElement(".ARTIKEL>article>div>div[property=\"schema:articleBody\"]");
                string relativeUrl  = new Uri(documentUrl).LocalPath;

                // The analysis value is whichever known value appears among the element's CSS classes;
                // it stays null when none of them is present.
                string analysis = analysisElement.ClassList.Intersect(_analysisValues).FirstOrDefault();

                return new Article(
                    analysis?.Trim(),
                    headlineElement.TextContent.Trim(),
                    relativeUrl,
                    timeElement.TextContent.Trim(),
                    authorElement.TextContent.Trim(),
                    bodyElement.TextContent.Trim());

                // Looks up a mandatory element and fails with the URL and selector in the message,
                // rather than with a context-free NullReferenceException further down.
                IElement RequireElement(string selector)
                {
                    return articleDocument.QuerySelector(selector)
                           ?? throw new InvalidOperationException(
                               $"Article page '{documentUrl}' has no element matching '{selector}'.");
                }
            }
        }
Example #2
0
 /// <summary>
 /// Creates a result that pairs an entity with the result type assigned to it.
 /// </summary>
 /// <param name="entity">Value stored in <c>Entity</c>.</param>
 /// <param name="resultType">Value stored in <c>ResultType</c>.</param>
 public ParsingResult(T entity, ParsingResultType resultType)
 {
     this.Entity = entity;
     this.ResultType = resultType;
 }