Пример #1
0
        /// <summary>
        /// Narrows <paramref name="links"/> to the entries that match the pattern in <c>request.Regex</c>.
        /// </summary>
        /// <param name="request">Crawler request whose <c>Regex</c> property supplies the pattern.</param>
        /// <param name="links">Links to filter.</param>
        /// <returns>
        /// <paramref name="links"/> unchanged when the pattern is null, empty or whitespace;
        /// otherwise a lazily evaluated sequence containing only the matching links.
        /// </returns>
        /// <exception cref="ArgumentException">The pattern is not a valid regular expression.</exception>
        private static IEnumerable<string> FilterByRegularExpression(DotnetCrawlerRequest request, IEnumerable<string> links)
        {
            // No pattern configured: hand the links back untouched.
            if (string.IsNullOrWhiteSpace(request.Regex))
            {
                return links;
            }

            // The constructor either yields an instance or throws, so the result needs no null check.
            var regex = new Regex(request.Regex);

            return links.Where(link => regex.IsMatch(link));
        }
Пример #2
0
        // Verifies that calling GetLinksAsync with a negative level is reported as an
        // ArgumentOutOfRangeException whose message starts with the expected text.
        public void GetLinksAsync_ThrowsArgumentException()
        {
            var request      = new DotnetCrawlerRequest();
            var htmlDocument = new HtmlDocument();

            // The web client is stubbed so no real HTTP request is made.
            var webClientMock = new Mock<IWebClientService>();
            webClientMock.Setup(a => a.FromWebAsync(It.IsAny<string>())).ReturnsAsync(htmlDocument);

            var linkReader = new DotnetCrawlerPageLinkReader(webClientMock.Object);

            // NOTE(review): GetLinksAsync is an async method, so the exception is carried by the
            // returned task. Whether Throw<T>() observes it depends on the FluentAssertions
            // version in use - confirm, and consider Awaiting(...)/ThrowAsync<T>() if it does not.
            linkReader.Invoking(y => y.GetLinksAsync(request, -1))
                .Should().Throw<ArgumentOutOfRangeException>()
                // Ordinal comparison keeps the prefix check independent of the current culture.
                .Where(e => e.Message.StartsWith("Specified argument was out of the range", StringComparison.Ordinal));
        }
Пример #3
0
        // Verifies that an empty HTML document yields no links and that the web client is
        // called exactly once.
        //
        // Returns a Task instead of void so the test runner can await the test and observe
        // assertion failures; the type is fully qualified so no extra using directive is needed.
        public async System.Threading.Tasks.Task GetLinksAsync_ReturnsNoLinks()
        {
            var request      = new DotnetCrawlerRequest();
            var htmlDocument = new HtmlDocument();

            // The web client is stubbed to return the empty document for any URL.
            var webClientMock = new Mock<IWebClientService>();
            webClientMock.Setup(a => a.FromWebAsync(It.IsAny<string>())).ReturnsAsync(htmlDocument);

            var linkReader = new DotnetCrawlerPageLinkReader(webClientMock.Object);

            IEnumerable<string> links = await linkReader.GetLinksAsync(request);

            links.Should().BeEmpty();
            webClientMock.Verify(m => m.FromWebAsync(It.IsAny<string>()), Times.Exactly(1));
        }
Пример #4
0
        /// <summary>
        /// Fetches the document at <c>request.Url</c> through the web client service, extracts
        /// its links with <c>ProcessLinks</c> and applies the request's regular-expression filter.
        /// </summary>
        /// <param name="request">Request supplying the URL to fetch and the filter settings.</param>
        /// <returns>
        /// The filtered links, or an empty sequence when any step inside the try block throws.
        /// </returns>
        private async Task<IEnumerable<string>> GetPageLinksAsync(DotnetCrawlerRequest request)
        {
            try
            {
                var page = await _webClientService.FromWebAsync(request.Url);

                return FilterByRegularExpression(request, ProcessLinks(page));
            }
            catch (Exception)
            {
                // Best-effort: a failure while fetching or processing is reported as "no links".
                return Enumerable.Empty<string>();
            }
        }
Пример #5
0
        // Verifies that the links of the HTML produced by RootHtml() are returned (three are
        // expected) and that the web client is called exactly once.
        //
        // Returns a Task instead of void so the test runner can await the test and observe
        // assertion failures; the type is fully qualified so no extra using directive is needed.
        public async System.Threading.Tasks.Task GetLinksAsync_ReturnsAllLinks()
        {
            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(RootHtml());

            var request = new DotnetCrawlerRequest();

            // The web client is stubbed to return the prepared document for any URL.
            var webClientMock = new Mock<IWebClientService>();
            webClientMock.Setup(a => a.FromWebAsync(It.IsAny<string>())).ReturnsAsync(htmlDocument);

            var linkReader = new DotnetCrawlerPageLinkReader(webClientMock.Object);

            IEnumerable<string> links = await linkReader.GetLinksAsync(request);

            links.Should().NotBeEmpty().And.HaveCount(3).And.ContainItemsAssignableTo<string>();
            webClientMock.Verify(m => m.FromWebAsync(It.IsAny<string>()), Times.Exactly(1));
        }
Пример #6
0
        /// <summary>
        /// Collects links for <paramref name="request"/>, recursing <paramref name="level"/> times.
        /// </summary>
        /// <param name="request">Request passed to every page fetch.</param>
        /// <param name="level">Recursion depth; 0 returns only the links of the requested page.</param>
        /// <returns>
        /// The page links when <paramref name="level"/> is 0; otherwise the flattened results of
        /// the recursive calls made for each link returned by <c>GetAllPagesLinks</c>.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="level"/> is negative.</exception>
        public async Task<IEnumerable<string>> GetLinksAsync(DotnetCrawlerRequest request, int level = 0)
        {
            if (level < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(level));
            }

            var rootUrls = await GetPageLinksAsync(request);
            if (level == 0)
            {
                return rootUrls;
            }

            var links = await GetAllPagesLinks(rootUrls);
            var remainingDepth = level - 1;

            // NOTE(review): the lambda ignores its argument, so every recursive call receives the
            // same request rather than one targeting the individual link - confirm this is intended.
            var pending = links.Select(unusedLink => GetLinksAsync(request, remainingDepth));
            var perLinkResults = await Task.WhenAll(pending);

            return perLinkResults.SelectMany(batch => batch);
        }
Пример #7
0
 /// <summary>
 /// Assigns <paramref name="request"/> to <c>Request</c> and returns this instance so
 /// further calls can be chained.
 /// </summary>
 /// <param name="request">The crawler request to store.</param>
 /// <returns>The same crawler instance the method was called on.</returns>
 public DotnetCrawler<TEntity> AddRequest(DotnetCrawlerRequest request)
 {
     Request = request;

     return this;
 }