/// <summary>
/// Narrows <paramref name="links"/> to the entries matched by the pattern in <c>request.Regex</c>.
/// </summary>
/// <param name="request">Crawl request whose <c>Regex</c> property holds the optional filter pattern.</param>
/// <param name="links">Candidate links to filter.</param>
/// <returns>
/// <paramref name="links"/> unchanged when the pattern is null, empty or whitespace; otherwise a
/// lazily evaluated sequence containing only the links the pattern matches.
/// </returns>
private static IEnumerable<string> FilterByRegularExpression(DotnetCrawlerRequest request, IEnumerable<string> links)
{
    if (string.IsNullOrWhiteSpace(request.Regex))
    {
        return links;
    }

    // A constructor call never yields null, so no null check is needed before using the regex.
    var regex = new Regex(request.Regex);
    return links.Where(link => regex.IsMatch(link));
}
/// <summary>
/// Checks that asking for links with a negative level is rejected with an
/// <see cref="ArgumentOutOfRangeException"/>.
/// </summary>
public void GetLinksAsync_ThrowsArgumentException()
{
    // Arrange
    var crawlRequest = new DotnetCrawlerRequest();
    var emptyDocument = new HtmlDocument();
    var webClient = new Mock<IWebClientService>();
    webClient
        .Setup(client => client.FromWebAsync(It.IsAny<string>()))
        .ReturnsAsync(emptyDocument);
    var reader = new DotnetCrawlerPageLinkReader(webClient.Object);

    // Act / Assert
    reader.Invoking(r => r.GetLinksAsync(crawlRequest, -1))
        .Should().Throw<ArgumentOutOfRangeException>()
        .Where(ex => ex.Message.StartsWith("Specified argument was out of the range"));
}
/// <summary>
/// Checks that reading links from an empty <c>HtmlDocument</c> yields no links and that the
/// page is requested from the web client exactly once.
/// </summary>
/// <remarks>
/// Returns a task instead of <c>void</c> so the test runner can await completion and observe
/// assertion failures raised after the first <c>await</c>.
/// </remarks>
public async System.Threading.Tasks.Task GetLinksAsync_ReturnsNoLinks()
{
    // Arrange
    DotnetCrawlerRequest request = new DotnetCrawlerRequest();
    HtmlDocument htmlDocument = new HtmlDocument();
    Mock<IWebClientService> webClientMock = new Mock<IWebClientService>();
    webClientMock.Setup(a => a.FromWebAsync(It.IsAny<string>())).ReturnsAsync(htmlDocument);
    DotnetCrawlerPageLinkReader linkReader = new DotnetCrawlerPageLinkReader(webClientMock.Object);

    // Act
    IEnumerable<string> links = await linkReader.GetLinksAsync(request);

    // Assert
    links.Should().BeEmpty();
    webClientMock.Verify(m => m.FromWebAsync(It.IsAny<string>()), Times.Exactly(1));
}
/// <summary>
/// Fetches the page at <c>request.Url</c>, extracts its links via <c>ProcessLinks</c> and applies
/// the request's optional regular-expression filter.
/// </summary>
/// <param name="request">Crawl request supplying the page URL and the optional filter pattern.</param>
/// <returns>
/// The filtered links as a materialized list, or an empty sequence when any step throws.
/// </returns>
private async Task<IEnumerable<string>> GetPageLinksAsync(DotnetCrawlerRequest request)
{
    try
    {
        var htmlDocument = await _webClientService.FromWebAsync(request.Url);
        IEnumerable<string> links = ProcessLinks(htmlDocument);
        links = FilterByRegularExpression(request, links);

        // The regex filter is a deferred LINQ query. Materialize it here so that exceptions
        // raised while enumerating are handled by the catch below rather than surfacing later
        // in the caller, and so callers can enumerate the result more than once without
        // re-running the query.
        return links.ToList();
    }
    catch (Exception)
    {
        // Best-effort: a page that cannot be fetched, parsed or filtered contributes no links.
        return Enumerable.Empty<string>();
    }
}
/// <summary>
/// Checks that reading links from the document built from <c>RootHtml()</c> yields three
/// string links and that the page is requested from the web client exactly once.
/// </summary>
/// <remarks>
/// Returns a task instead of <c>void</c> so the test runner can await completion and observe
/// assertion failures raised after the first <c>await</c>.
/// </remarks>
public async System.Threading.Tasks.Task GetLinksAsync_ReturnsAllLinks()
{
    // Arrange
    string html = RootHtml();
    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html);
    DotnetCrawlerRequest request = new DotnetCrawlerRequest();
    Mock<IWebClientService> webClientMock = new Mock<IWebClientService>();
    webClientMock.Setup(a => a.FromWebAsync(It.IsAny<string>())).ReturnsAsync(htmlDocument);
    DotnetCrawlerPageLinkReader linkReader = new DotnetCrawlerPageLinkReader(webClientMock.Object);

    // Act
    IEnumerable<string> links = await linkReader.GetLinksAsync(request);

    // Assert
    links.Should().NotBeEmpty().And.HaveCount(3).And.ContainItemsAssignableTo<string>();
    webClientMock.Verify(m => m.FromWebAsync(It.IsAny<string>()), Times.Exactly(1));
}
/// <summary>
/// Returns the links found for <paramref name="request"/>, recursing while
/// <paramref name="level"/> is greater than zero.
/// </summary>
/// <param name="request">Crawl request passed to <c>GetPageLinksAsync</c> and to each recursive call.</param>
/// <param name="level">
/// Remaining recursion depth. At 0 the links of the requested page are returned directly.
/// </param>
/// <returns>
/// At level 0, the result of <c>GetPageLinksAsync</c>; otherwise the flattened results of one
/// recursive call (at <c>level - 1</c>) per entry returned by <c>GetAllPagesLinks</c>.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="level"/> is negative.</exception>
public async Task<IEnumerable<string>> GetLinksAsync(DotnetCrawlerRequest request, int level = 0)
{
    if (level < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(level));
    }

    var pageLinks = await GetPageLinksAsync(request);
    if (level == 0)
    {
        return pageLinks;
    }

    var discoveredLinks = await GetAllPagesLinks(pageLinks);
    var remainingLevels = level - 1;

    // NOTE(review): the lambda does not use its element, so every recursive call re-issues the
    // same request instead of one targeting the discovered link. Confirm whether this is intended.
    var nestedResults = await Task.WhenAll(
        discoveredLinks.Select(_ => GetLinksAsync(request, remainingLevels)));

    return nestedResults.SelectMany(batch => batch);
}
/// <summary>
/// Assigns the crawl request to <c>Request</c>.
/// </summary>
/// <param name="request">The request to store.</param>
/// <returns>This instance, so that further calls can be chained.</returns>
public DotnetCrawler<TEntity> AddRequest(DotnetCrawlerRequest request)
{
    Request = request;

    return this;
}