Example #1
        public async Task<ScrapeResultDto<ArticleItemDto>> GetArticles(string url, bool includeAbstract, CancellationToken cancellationToken)
        {
            ScrapeResultDto<ArticleItemDto> result = new ScrapeResultDto<ArticleItemDto>()
            {
                RequestUrl = url
            };

            HtmlDocument htmlDocument = await HtmlAgilityHelper.GetHtmlDocument(url, cancellationToken);

            return htmlDocument == null ? result : ScrapeArticleList(result, htmlDocument, includeAbstract);
        }
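The ScrapeResultDto<T> container itself does not appear on this page; the following is a minimal sketch of its shape, pieced together from how the examples use it. The real class may carry more members, and the IsSucess spelling is kept exactly as the calling code has it:

        //Hypothetical reconstruction of ScrapeResultDto<T> -inferred from usage, not the original source
        public class ScrapeResultDto<T>
        {
            public string RequestUrl { get; set; }
            public string ContinueUrl { get; set; }
            public bool IsSucess { get; set; }
            public Exception Exception { get; set; }

            //Initialized so scrapers can call Result.AddRange(...) on a fresh instance
            public List<T> Result { get; set; } = new List<T>();
        }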
        [Fact]
        public void CatchupArchiveArticles_ExtractsArxivIdLabelText_AllArticlesLabeledAsReplaced()
        {
            HtmlDocument doc = _articleListTestBuilder.ArticleList_Catchup_SubjectGroup_HtmlDocument();
            ScrapeResultDto<ArticleItemDto> request = new ScrapeResultDto<ArticleItemDto>();
            int expected = 1010;

            var sut = _scraper.ScrapeCatchUpArticleList(request, doc, true)
                      .Result
                      .Count(r => r.ArxivIdLabel == "replaced");

            sut.Should().Be(expected);
        }
        [Fact]
        public void CatchupArchiveArticles_Extracts_AllArticlesIn_HtmlDocument()
        {
            HtmlDocument doc = _articleListTestBuilder.ArticleList_Catchup_SubjectGroup_HtmlDocument();
            ScrapeResultDto<ArticleItemDto> request = new ScrapeResultDto<ArticleItemDto>();
            int expected = 2045;

            var sut = _scraper.ScrapeCatchUpArticleList(request, doc, true);

            sut.Should().NotBeNull();
            sut.Result.Should().HaveCountGreaterOrEqualTo(2);
            sut.Result.Should().HaveCount(expected);
        }
        [Fact]
        public void Gets_article_list_count_as_dto_collection()
        {
            //Arrange
            HtmlDocument doc = _articleListTestBuilder.ArticleList_withoutAbstract_HtmlDocument();
            ScrapeResultDto<ArticleItemDto> request = new ScrapeResultDto<ArticleItemDto>();
            int expected = 25;

            //Act
            var sut = _scraper.ScrapeArticleList(request, doc, false);

            //Assert
            sut.Should().NotBeNull();
            sut.Result.Should().HaveCount(expected);
        }
        public async Task<ScrapeResultDto<Article>> SubmissionsBySubjectCodeAsync(string url, string subjectCode, CancellationToken cancellationToken)
        {
            ScrapeResultDto<Article> newSubmissionResult = new ScrapeResultDto<Article>();

            try
            {
                ScrapeResultDto<ArticleItemDto> scrapeResult =
                    await _articleListScraper.GetArticles(url, true, cancellationToken);

                newSubmissionResult.ContinueUrl = scrapeResult.ContinueUrl;
                newSubmissionResult.RequestUrl  = scrapeResult.RequestUrl;

                List<Article> articles = MapArticlesToDomain(scrapeResult.Result);

                if (articles == null)
                {
                    return newSubmissionResult;
                }

                //Only include Submissions
                List<Article> submissions = articles
                                            .Where(a => a.ScrapeContext == ArticleScrapeContextEnum.Submission)
                                            .ToList();

                //Fetch existing Articles scraped today
                var existingIds = _context.SubjectItemArticles
                                  .Where(j => j.Article.DisplayDate.Date == DateTime.Now.Date
                                         //&& j.SubjectItem.Code == subjectCode
                                         && j.SubjectItem.IsPrimary)
                                  .Select(a => a.Article.ArxivId)
                                  .ToList();

                //Keep only submissions that are not already in the database
                List<Article> newSubmissions = submissions
                                               .Where(a => !existingIds.Contains(a.ArxivId))
                                               .ToList();

                int repoResult = _repo.SaveBySubjectGroup(newSubmissions);
                newSubmissionResult.IsSucess = repoResult >= 1;
                newSubmissionResult.Result   = newSubmissions;

                return newSubmissionResult;
            }
            catch (Exception ex)
            {
                newSubmissionResult.Exception = ex;
                return newSubmissionResult;
            }
        }
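A hedged sketch of how a caller might drive SubmissionsBySubjectCodeAsync; the scraperService variable, the listing URL, and the subject code are illustrative assumptions, not part of the original examples:

        //Hypothetical caller, inside some async method
        using (var cts = new CancellationTokenSource(TimeSpan.FromSeconds(30)))
        {
            ScrapeResultDto<Article> saved = await scraperService.SubmissionsBySubjectCodeAsync(
                "https://arxiv.org/list/cs.LG/new", "cs.LG", cts.Token);

            if (!saved.IsSucess)
            {
                Console.WriteLine(saved.Exception?.Message ?? "No new submissions saved.");
            }
        }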
        [Fact]
        public void Gets_article_abstract_text()
        {
            //Arrange
            HtmlDocument doc = _articleListTestBuilder.ArticleList_withAbstract_HtmlDocument();
            ScrapeResultDto<ArticleItemDto> request = new ScrapeResultDto<ArticleItemDto>();
            string expected = _articleListTestBuilder.WithAbstractHtml_ArticleItem2_AbstractText();

            //Act
            var sut = _scraper.ScrapeArticleList(request, doc, true);

            //Assert
            sut.Result.Should().NotBeEmpty();
            sut.Result.Should().HaveCountGreaterOrEqualTo(2);
            Assert.Equal(expected, sut.Result[1].AbstractText);
        }
        public async Task<ScrapeResultDto<Article>> CatchupBySubjectGroupAsync(string url, CancellationToken cancellationToken)
        {
            ScrapeResultDto<Article> catchupResult = new ScrapeResultDto<Article>();

            try
            {
                ScrapeResultDto<ArticleItemDto> dtoResult =
                    await _articleListScraper.GetCatchUpArticles(url, true, cancellationToken);

                catchupResult.ContinueUrl = dtoResult.ContinueUrl;
                catchupResult.RequestUrl  = dtoResult.RequestUrl;

                List<Article> articles = MapArticlesToDomain(dtoResult.Result);

                if (articles == null || articles.Count == 0)
                {
                    return catchupResult;
                }

                //Only include articles scraped in the CatchUp context
                List<Article> catchups = articles
                                         .Where(a => a.ScrapeContext == ArticleScrapeContextEnum.CatchUp)
                                         .ToList();

                //Fetch the ArxivIds of articles already scraped today
                var existingArxivIds = await _context.SubjectItemArticles
                                       .Where(j => j.Article.DisplayDate.Date == DateTime.Now.Date)
                                       .Select(a => a.Article.ArxivId)
                                       .ToListAsync();

                //Keep only catch-up articles that are not already in the database
                List<Article> newCatchups = catchups
                                            .Where(a => !existingArxivIds.Contains(a.ArxivId))
                                            .ToList();

                //Persist to Database
                int success = _repo.SaveBySubjectGroup(newCatchups);

                catchupResult.IsSucess = success >= 1;
                catchupResult.Result   = newCatchups;

                return catchupResult;
            }
            catch (Exception ex)
            {
                catchupResult.Exception = ex;
                return catchupResult;
            }
        }
Example #8
        public ScrapeResultDto<ArticleItemDto> ScrapeArticleList(ScrapeResultDto<ArticleItemDto> request, HtmlDocument htmlDocument, bool includeAbstract = true)
        {
            try
            {
                var parseMain = (from info in htmlDocument.DocumentNode.SelectNodes("//div[@id='dlpage']")
                                 from h3Element in info.SelectNodes("h3")
                                 let dlElement = h3Element.GetNextSibling("dl")
                                 where dlElement != null
                                 let dte = dlElement.SelectNodes("//span[@class='list-identifier']")
                                 let dde = dlElement.SelectNodes("//div[@class='meta']")
                                 select new
                                 {
                                     h3 = h3Element,
                                     dl = dlElement,

                                     dt = dlElement.Descendants("dt"),
                                     dd = dlElement.Descendants("dd")
                                 }).ToList();

                if (parseMain.Count == 0)
                {
                    return request;
                }

                for (int i = 0; i < parseMain.Count; i++)
                {
                    var h3Elem  = parseMain[i].h3.InnerText;
                    var dt_list = parseMain[i].dl.Descendants("dt").ToArray();
                    var dd_list = parseMain[i].dl.Descendants("dd").ToArray();

                    var list = ProcessElems(h3Elem, dt_list, dd_list, includeAbstract);

                    if (list != null && list.Count > 0)
                    {
                        request.Result.AddRange(list);
                        request.IsSucess = true;
                    }
                }
            }
            catch (Exception)
            {
                //Swallow parse errors and report failure via the result flag
                request.IsSucess = false;
            }

            return request;
        }
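GetNextSibling here (and GetPreviousSibling in Example #9) are not stock HtmlAgilityPack methods, so they are presumably project-local extensions. A plausible sketch, assuming they walk over intervening text and whitespace nodes until an element with the requested name is found:

        //Hypothetical extension methods -the project's real helpers may differ
        public static class HtmlNodeExtensions
        {
            public static HtmlNode GetNextSibling(this HtmlNode node, string name)
            {
                //Skip text/whitespace siblings until an element named 'name' appears
                for (var sibling = node.NextSibling; sibling != null; sibling = sibling.NextSibling)
                {
                    if (sibling.Name == name)
                    {
                        return sibling;
                    }
                }
                return null;
            }

            public static HtmlNode GetPreviousSibling(this HtmlNode node, string name)
            {
                //Same walk, in the opposite direction
                for (var sibling = node.PreviousSibling; sibling != null; sibling = sibling.PreviousSibling)
                {
                    if (sibling.Name == name)
                    {
                        return sibling;
                    }
                }
                return null;
            }
        }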
Example #9
        public ScrapeResultDto<ArticleItemDto> ScrapeCatchUpArticleList(ScrapeResultDto<ArticleItemDto> request, HtmlDocument htmlDocument, bool includeAbstract = true)
        {
            try
            {
                var parsedHtml = (from info in htmlDocument.DocumentNode.SelectNodes("//div[@id='dlpage']")
                                  from dlElement in info.SelectNodes("dl")
                                  let olElement = info.SelectSingleNode("ol")
                                  let h1Page = info.SelectSingleNode("h1")
                                  let parentH3 = dlElement.GetPreviousSibling("h2")
                                  select new
                                  {
                                      ol = olElement,
                                      h1 = h1Page,
                                      date_info = parentH3,
                                      dl = dlElement
                                  }).ToList();

                if (parsedHtml.Count == 0)
                {
                    return request;
                }

                request.ContinueUrl = GetCatchUpContinueUrl(parsedHtml[0].ol?.ChildNodes);

                //Html page title information - used to establish the Scrape Context Enum (New, Recent, Catchup etc...)
                var pageInfo = parsedHtml[0].h1?.InnerText;

                for (int i = 0; i < parsedHtml.Count; i++)
                {
                    var list = ProcessCatchUpElements(
                        pageInfo,
                        parsedHtml[i].date_info?.InnerText,
                        parsedHtml[i].dl.Descendants("dt").ToArray(),
                        parsedHtml[i].dl.Descendants("dd").ToArray(),
                        includeAbstract);
                    request.Result.AddRange(list);
                }
            }
            catch (Exception)
            {
                //Swallow parse errors; whatever was parsed so far is still returned
            }

            return request;
        }
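An end-to-end sketch of driving the catch-up scraper; the ArticleListScraper class name and the catch-up URL are assumptions for illustration:

        //Hypothetical driver -scraper construction and URL are illustrative only
        var web = new HtmlWeb();
        HtmlDocument doc = web.Load("https://arxiv.org/catchup?archive=cs");

        var scraper = new ArticleListScraper();
        var result = scraper.ScrapeCatchUpArticleList(new ScrapeResultDto<ArticleItemDto>(), doc, includeAbstract: true);

        Console.WriteLine($"Scraped {result.Result.Count} articles; next page: {result.ContinueUrl}");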