/// <summary>
/// Follows every article link found in a listing document, parses each linked page into an
/// <c>Article</c>, and classifies it as new or already known.
/// </summary>
/// <param name="document">Listing document; anchors matching <c>article[id] .headline-medium a</c> are followed.</param>
/// <param name="requestDelay">Forwarded unchanged to <c>GetDocumentFromUrl</c> for every article request.</param>
/// <returns>
/// One <c>ParsingResult</c> per parsed article. An article whose relative URL is already in
/// <c>_existingArticles</c> is marked <c>AlreadyExists</c>; otherwise the URL is registered and the
/// article is marked <c>PendingForSave</c>. If any exception is thrown, it is logged and the results
/// collected up to that point are returned.
/// </returns>
private List<ParsingResult<Article>> GetArticlesFromDocument(IHtmlDocument document, int requestDelay)
{
    var resultArticles = new List<ParsingResult<Article>>();

    try
    {
        foreach (string url in GetArticleUrlsFromDocument())
        {
            IHtmlDocument articleDocument = GetDocumentFromUrl(url, requestDelay);
            Article article = GetArticleFromDocument(url, articleDocument);

            ParsingResultType type = ParsingResultType.AlreadyExists;
            if (!_existingArticles.Contains(article.RelativeUrl))
            {
                // First time this relative URL is seen: remember it so later occurrences are flagged.
                _existingArticles.Add(article.RelativeUrl);
                type = ParsingResultType.PendingForSave;
            }

            resultArticles.Add(new ParsingResult<Article>(article, type));
        }
    }
    catch (Exception ex)
    {
        // Exception goes first so it is recorded as the exception (with stack trace) instead of
        // being consumed as an unused message-format argument.
        // NOTE(review): assumes _logger is a Microsoft.Extensions.Logging ILogger - confirm.
        _logger.LogError(ex, "Unexpected exception when parsing articles");
    }

    return resultArticles;

    // Lazily yields the absolute URL (RootPath + href) of every article anchor in the listing document.
    // Being an iterator, the selector query runs on first enumeration, i.e. inside the try block above.
    IEnumerable<string> GetArticleUrlsFromDocument()
    {
        IHtmlCollection<IElement> anchors = document.QuerySelectorAll("article[id] .headline-medium a");
        foreach (var element in anchors)
        {
            yield return $"{RootPath}{element.GetAttribute("href")}";
        }
    }

    // Extracts the article fields from a single article page.
    Article GetArticleFromDocument(string documentUrl, IHtmlDocument articleDocument)
    {
        var analysisElement = articleDocument.QuerySelector(".ARTIKEL>article>.analysis");
        var headlineElement = articleDocument.QuerySelector(".ARTIKEL>article>.headline-large");
        var timeElement = articleDocument.QuerySelector(".ARTIKEL>article>cite>time");
        var authorElement = articleDocument.QuerySelector(".ARTIKEL>article>cite>span");
        var bodyElement = articleDocument.QuerySelector(".ARTIKEL>article>div>div[property=\"schema:articleBody\"]");

        // The stored relative URL is the path component of the absolute document URL.
        string relativeUrl = new Uri(documentUrl).LocalPath;

        // The analysis value is already optional (FirstOrDefault may yield null and the result is
        // trimmed null-safely below), so a page without an .analysis element also yields null
        // instead of throwing and aborting the remaining articles.
        string analysis = analysisElement?.ClassList.Intersect(_analysisValues).FirstOrDefault();
        string headline = headlineElement.TextContent;
        string time = timeElement.TextContent;
        string author = authorElement.TextContent;
        string body = bodyElement.TextContent;

        return new Article(analysis?.Trim(), headline.Trim(), relativeUrl, time.Trim(), author.Trim(), body.Trim());
    }
}
/// <summary>
/// Creates a parsing result that pairs an entity with its result type.
/// </summary>
/// <param name="entity">Value stored in <c>Entity</c>.</param>
/// <param name="resultType">Value stored in <c>ResultType</c>.</param>
public ParsingResult(T entity, ParsingResultType resultType)
{
    this.Entity = entity;
    this.ResultType = resultType;
}