public async Task<ScrapeResultDto<ArticleItemDto>> GetArticles(string url, bool includeAbstract, CancellationToken cancellationToken)
{
    ScrapeResultDto<ArticleItemDto> result = new ScrapeResultDto<ArticleItemDto>() { RequestUrl = url };
    HtmlDocument htmlDocument = await HtmlAgilityHelper.GetHtmlDocument(url, cancellationToken);

    return htmlDocument == null
        ? result
        : ScrapeArticleList(result, htmlDocument, includeAbstract);
}
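// HtmlAgilityHelper.GetHtmlDocument is defined elsewhere in the repo. A minimal
// sketch of what it likely does, assuming HttpClient plus HtmlAgilityPack; the
// class layout and the null-on-failure contract are assumptions inferred from the
// call site above, which treats a null return as "page could not be fetched".
public static class HtmlAgilityHelper
{
    private static readonly HttpClient _httpClient = new HttpClient();

    public static async Task<HtmlDocument> GetHtmlDocument(string url, CancellationToken cancellationToken)
    {
        try
        {
            using HttpResponseMessage response = await _httpClient.GetAsync(url, cancellationToken);
            if (!response.IsSuccessStatusCode)
            {
                return null;
            }

            string html = await response.Content.ReadAsStringAsync();
            var document = new HtmlDocument();
            document.LoadHtml(html);
            return document;
        }
        catch (HttpRequestException)
        {
            // Match the caller's contract: null signals an unreachable page.
            return null;
        }
    }
}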
public void CatchupArchiveArticles_ExtractsArxivIdLabelText_AllArticlesLabeledAsReplaced()
{
    //Arrange
    HtmlDocument doc = _articleListTestBuilder.ArticleList_Catchup_SubjectGroup_HtmlDocument();
    ScrapeResultDto<ArticleItemDto> request = new ScrapeResultDto<ArticleItemDto>();
    int expected = 1010;

    //Act
    var actual = _scraper.ScrapeCatchUpArticleList(request, doc, true)
        .Result
        .Count(r => r.ArxivIdLabel == "replaced");

    //Assert
    actual.Should().Be(expected);
}
public void CatchupArchiveArticles_Extracts_AllArticlesIn_HtmlDocument()
{
    //Arrange
    HtmlDocument doc = _articleListTestBuilder.ArticleList_Catchup_SubjectGroup_HtmlDocument();
    ScrapeResultDto<ArticleItemDto> request = new ScrapeResultDto<ArticleItemDto>();
    int expected = 2045;

    //Act
    var result = _scraper.ScrapeCatchUpArticleList(request, doc, true);

    //Assert
    result.Should().NotBeNull();
    result.Result.Should().HaveCount(expected);
}
public void Gets_article_list_count_as_dto_collection()
{
    //Arrange
    HtmlDocument doc = _articleListTestBuilder.ArticleList_withoutAbstract_HtmlDocument();
    ScrapeResultDto<ArticleItemDto> request = new ScrapeResultDto<ArticleItemDto>();
    int expected = 25;

    //Act
    var result = _scraper.ScrapeArticleList(request, doc, false);

    //Assert
    result.Should().NotBeNull();
    result.Result.Should().HaveCount(expected);
}
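// The tests above construct ScrapeResultDto<T> with a parameterless constructor
// and the scraper immediately calls AddRange on Result, so Result must be
// initialized eagerly. A sketch of the DTO's shape, inferred entirely from how
// this file uses it (the real definition lives elsewhere in the repo):
public class ScrapeResultDto<T>
{
    public string RequestUrl { get; set; }
    public string ContinueUrl { get; set; }
    public bool IsSucess { get; set; }       // spelling matches the property name used throughout this file
    public Exception Exception { get; set; }
    public List<T> Result { get; set; } = new List<T>();  // initialized so callers can AddRange without a null check
}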
public async Task<ScrapeResultDto<Article>> SubmissionsBySubjectCodeAsync(string url, string subjectCode, CancellationToken cancellationToken)
{
    ScrapeResultDto<Article> newSubmissionResult = new ScrapeResultDto<Article>();
    try
    {
        ScrapeResultDto<ArticleItemDto> scrapeResult = await _articleListScraper.GetArticles(url, true, cancellationToken);
        newSubmissionResult.ContinueUrl = scrapeResult.ContinueUrl;
        newSubmissionResult.RequestUrl = scrapeResult.RequestUrl;

        List<Article> articles = MapArticlesToDomain(scrapeResult.Result);
        if (articles == null || articles.Count == 0)
        {
            return newSubmissionResult;
        }

        //Only include Submissions
        List<Article> submissions = articles
            .Where(a => a.ScrapeContext == ArticleScrapeContextEnum.Submission)
            .ToList();

        //Fetch ArxivIds of Articles already scraped today
        var existingIds = _context.SubjectItemArticles
            .Where(j => j.Article.DisplayDate.Date == DateTime.Now.Date
                //&& j.SubjectItem.Code == subjectCode
                && j.SubjectItem.IsPrimary)
            .Select(a => a.Article.ArxivId)
            .ToList();

        //Keep only articles not yet in the database
        List<Article> newSubmissions = submissions
            .Where(a => !existingIds.Contains(a.ArxivId))
            .ToList();

        int repoResult = _repo.SaveBySubjectGroup(newSubmissions);
        newSubmissionResult.IsSucess = repoResult >= 1;
        newSubmissionResult.Result = newSubmissions;
        return newSubmissionResult;
    }
    catch (Exception ex)
    {
        newSubmissionResult.Exception = ex;
        return newSubmissionResult;
    }
}
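// MapArticlesToDomain is implemented elsewhere in the repo. A minimal sketch of a
// plausible DTO-to-entity projection, using only members this file actually
// touches (Article.ArxivId and Article.ScrapeContext); the DTO-side property
// names are assumptions:
private List<Article> MapArticlesToDomain(List<ArticleItemDto> dtos)
{
    if (dtos == null)
    {
        return null;
    }

    return dtos
        .Select(dto => new Article
        {
            ArxivId = dto.ArxivId,              // assumed DTO property
            ScrapeContext = dto.ScrapeContext,  // assumed to be assigned during scraping
        })
        .ToList();
}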
public void Gets_article_abstract_text()
{
    //Arrange
    HtmlDocument doc = _articleListTestBuilder.ArticleList_withAbstract_HtmlDocument();
    ScrapeResultDto<ArticleItemDto> request = new ScrapeResultDto<ArticleItemDto>();
    string expected = _articleListTestBuilder.WithAbstractHtml_ArticleItem2_AbstractText();

    //Act
    var result = _scraper.ScrapeArticleList(request, doc, true);

    //Assert
    result.Result.Should().NotBeEmpty();
    result.Result.Should().HaveCountGreaterOrEqualTo(2);
    result.Result[1].AbstractText.Should().Be(expected);
}
public async Task<ScrapeResultDto<Article>> CatchupBySubjectGroupAsync(string url, CancellationToken cancellationToken)
{
    ScrapeResultDto<Article> catchupResult = new ScrapeResultDto<Article>();
    try
    {
        ScrapeResultDto<ArticleItemDto> dtoResult = await _articleListScraper.GetCatchUpArticles(url, true, cancellationToken);
        catchupResult.ContinueUrl = dtoResult.ContinueUrl;
        catchupResult.RequestUrl = dtoResult.RequestUrl;

        List<Article> articles = MapArticlesToDomain(dtoResult.Result);
        if (articles == null || articles.Count == 0)
        {
            return catchupResult;
        }

        //Only include CatchUp articles
        List<Article> catchups = articles
            .Where(a => a.ScrapeContext == ArticleScrapeContextEnum.CatchUp)
            .ToList();

        //Fetch ArxivIds of Articles already scraped today
        var existingArxivIds = await _context.SubjectItemArticles
            .Where(j => j.Article.DisplayDate.Date == DateTime.Now.Date)
            .Select(a => a.Article.ArxivId)
            .ToListAsync();

        //Keep only articles not yet in the database
        List<Article> newCatchups = catchups
            .Where(a => !existingArxivIds.Contains(a.ArxivId))
            .ToList();

        //Persist to Database
        int success = _repo.SaveBySubjectGroup(newCatchups);
        catchupResult.IsSucess = success >= 1;
        catchupResult.Result = newCatchups;
        return catchupResult;
    }
    catch (Exception ex)
    {
        catchupResult.Exception = ex;
        return catchupResult;
    }
}
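// A hypothetical call site showing how the two service methods above might be
// driven, including following the pagination link the scraper surfaces. The
// worker-method name, URL, and subject code are purely illustrative:
public async Task RunDailyScrapeAsync(CancellationToken cancellationToken)
{
    ScrapeResultDto<Article> submissions = await SubmissionsBySubjectCodeAsync(
        "https://arxiv.org/list/cs.LG/new", "cs.LG", cancellationToken);

    // Follow the continuation page, if the scrape surfaced one.
    if (submissions.IsSucess && !string.IsNullOrEmpty(submissions.ContinueUrl))
    {
        await SubmissionsBySubjectCodeAsync(submissions.ContinueUrl, "cs.LG", cancellationToken);
    }
}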
public ScrapeResultDto<ArticleItemDto> ScrapeArticleList(ScrapeResultDto<ArticleItemDto> request, HtmlDocument htmlDocument, bool includeAbstract = true)
{
    try
    {
        //Each h3 heading on the listing page is followed by a dl element
        //containing dt/dd pairs for the articles in that group.
        var parseMain = (from info in htmlDocument.DocumentNode.SelectNodes("//div[@id='dlpage']")
                         from h3Element in info.SelectNodes("h3")
                         let dlElement = h3Element.GetNextSibling("dl")
                         where dlElement != null
                         select new
                         {
                             h3 = h3Element,
                             dl = dlElement
                         }).ToList();

        if (parseMain.Count == 0)
        {
            return request;
        }

        for (int i = 0; i < parseMain.Count; i++)
        {
            var h3Elem = parseMain[i].h3.InnerText;
            var dt_list = parseMain[i].dl.Descendants("dt").ToArray();
            var dd_list = parseMain[i].dl.Descendants("dd").ToArray();

            var list = ProcessElems(h3Elem, dt_list, dd_list, includeAbstract);
            if (list != null && list.Count > 0)
            {
                request.Result.AddRange(list);
                request.IsSucess = true;
            }
        }
    }
    catch (Exception)
    {
        request.IsSucess = false;
    }
    return request;
}
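// GetNextSibling and GetPreviousSibling (used here and in ScrapeCatchUpArticleList)
// are not part of HtmlAgilityPack, so they appear to be custom extensions defined
// elsewhere in the repo. A minimal sketch, assuming they walk sibling nodes until
// one matches the given element name:
public static class HtmlNodeExtensions
{
    // Walks forward through siblings until a node with the given element name is found.
    public static HtmlNode GetNextSibling(this HtmlNode node, string name)
    {
        for (HtmlNode sibling = node.NextSibling; sibling != null; sibling = sibling.NextSibling)
        {
            if (sibling.Name == name)
            {
                return sibling;
            }
        }
        return null;
    }

    // Walks backward through siblings until a node with the given element name is found.
    public static HtmlNode GetPreviousSibling(this HtmlNode node, string name)
    {
        for (HtmlNode sibling = node.PreviousSibling; sibling != null; sibling = sibling.PreviousSibling)
        {
            if (sibling.Name == name)
            {
                return sibling;
            }
        }
        return null;
    }
}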
public ScrapeResultDto<ArticleItemDto> ScrapeCatchUpArticleList(ScrapeResultDto<ArticleItemDto> request, HtmlDocument htmlDocument, bool includeAbstract = true)
{
    try
    {
        var parsedHtml = (from info in htmlDocument.DocumentNode.SelectNodes("//div[@id='dlpage']")
                          from dlElement in info.SelectNodes("dl")
                          let olElement = info.SelectSingleNode("ol")
                          let h1Page = info.SelectSingleNode("h1")
                          let dateHeading = dlElement.GetPreviousSibling("h2")
                          select new
                          {
                              ol = olElement,
                              h1 = h1Page,
                              date_info = dateHeading,
                              dl = dlElement
                          }).ToList();

        if (parsedHtml.Count == 0)
        {
            return request;
        }

        request.ContinueUrl = GetCatchUpContinueUrl(parsedHtml[0].ol?.ChildNodes);

        //Html page title information - used to establish the ScrapeContext enum (New, Recent, CatchUp etc.)
        var pageInfo = parsedHtml[0].h1?.InnerText;

        for (int i = 0; i < parsedHtml.Count; i++)
        {
            var list = ProcessCatchUpElements(
                pageInfo,
                parsedHtml[i].date_info?.InnerText,
                parsedHtml[i].dl.Descendants("dt").ToArray(),
                parsedHtml[i].dl.Descendants("dd").ToArray(),
                includeAbstract);

            request.Result.AddRange(list);
        }
    }
    catch (Exception)
    {
        request.IsSucess = false;
    }
    return request;
}
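// GetCatchUpContinueUrl is implemented elsewhere in this class. A plausible
// sketch, assuming the catch-up page exposes its pagination as an anchor inside
// the page's ol element; the "Continue" link-text match is an assumption:
private string GetCatchUpContinueUrl(HtmlNodeCollection olChildNodes)
{
    if (olChildNodes == null)
    {
        return null;
    }

    foreach (HtmlNode child in olChildNodes)
    {
        HtmlNode anchor = child.SelectSingleNode(".//a[@href]");
        if (anchor != null && anchor.InnerText.Contains("Continue"))
        {
            return anchor.GetAttributeValue("href", string.Empty);
        }
    }
    return null;
}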