public async Task Scrape(Uri uri)
{
    baseUri = uri;

    if (!DisableRobotsProtocol)
    {
        // robots.txt always lives at the site root, hence GetLeftPart(UriPartial.Authority)
        var robotsUri = new Uri(uri.GetLeftPart(UriPartial.Authority) + "/robots.txt");
        var robotsTxt = await httpClient.GetString(robotsUri);
        Robots.Load(robotsTxt, httpClient.UserAgentName);
    }

    // Note the blocking .Wait() inside an async method - any failure here
    // surfaces as an AggregateException rather than the original exception
    DoScrape(uri).Wait();
}
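A side note on GetLeftPart(UriPartial.Authority): it returns just the scheme and host, with no trailing slash, which is why appending "/robots.txt" lands at the site root no matter how deep the starting URI is. A quick sketch to confirm (C# top-level statements; the URL is made up):

var start = new Uri("https://example.com/blog/2015/post.html?page=2");

// GetLeftPart(UriPartial.Authority) -> "https://example.com" (scheme + host, no trailing slash)
var robotsUri = new Uri(start.GetLeftPart(UriPartial.Authority) + "/robots.txt");

Console.WriteLine(robotsUri); // https://example.com/robots.txt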
// This is where the rubber hits the road
async Task DoScrape(Uri uri)
{
    OnScrape(uri);

    // Stop crawling once the configured end time has passed
    if (endDateTime.HasValue && DateTimeProvider.UtcNow > endDateTime)
    {
        return;
    }

    // TryAdd returns false if this URI has been visited already
    if (!scrapedUris.TryAdd(uri))
    {
        return;
    }

    if (!DisableRobotsProtocol && !Robots.PathIsAllowed(uri.PathAndQuery))
    {
        return;
    }

    var htmlDoc = new HtmlDoc { Uri = uri };

    try
    {
        htmlDoc.Html = await httpClient.GetString(uri);
    }
    catch (Exception exception)
    {
        OnHttpClientException(exception);
    }

    if (string.IsNullOrEmpty(htmlDoc.Html))
    {
        return;
    }

    // Double negative: notify unless a filter is set and this URI fails it
    if (!(ObserverLinkFilter != null && !ObserverLinkFilter.IsMatch(uri.ToString())))
    {
        NotifyObservers(htmlDoc);
    }

    // Work out the page's base: if the last segment looks like a file name, strip it,
    // then make sure the base ends with a slash
    var pageBase = htmlDoc.Uri.Segments.Last().Contains('.')
        ? htmlDoc.Uri.ToString().Substring(0, htmlDoc.Uri.ToString().LastIndexOf('/'))
        : htmlDoc.Uri.ToString();

    if (!pageBase.EndsWith("/"))
    {
        pageBase += "/";
    }

    var pageBaseUri = new Uri(pageBase);

    // Only use of the CsQuery lib found so far
    CQ cq = htmlDoc.Html;

    // Doing some selecting: anchors and hrefs using jQuery-like syntax.
    // Out of the box, DoScrape does only the simplest finding of links.
    var links = cq["a"].Select(x => x.GetAttribute("href")).Where(x => x != null);

    // Looks like we're set up not to follow external links
    var localLinks = LocalLinks(links)
        .Select(x => NormalizeLink(x, pageBaseUri))
        .Where(x => x.ToString().StartsWith(baseUri.ToString()) && x.ToString().Length <= 2048);

    if (IncludeLinks != null)
    {
        localLinks = localLinks.Where(x => IncludeLinks.IsMatch(x.ToString()));
    }

    if (IgnoreLinks != null)
    {
        localLinks = localLinks.Where(x => !IgnoreLinks.IsMatch(x.ToString()));
    }

    if (MaxDepth.HasValue)
    {
        // +1 because Uri.Segments counts the leading "/" as a segment
        localLinks = localLinks.Where(x => x.Segments.Length <= MaxDepth + 1);
    }

    // Recursive call to scrape the links found, blocking until the whole subtree is done
    var tasks = localLinks.Select(DoScrape).ToArray();
    Task.WaitAll(tasks);
}
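Since this is CsQuery's one appearance, the selection is worth seeing in isolation. A self-contained sketch (the sample HTML is made up) using the same implicit string-to-CQ conversion and jQuery-style selector as DoScrape:

using System;
using System.Linq;
using CsQuery;

class LinkSelectionDemo
{
    static void Main()
    {
        var html = @"<html><body>
                       <a href='/about'>About</a>
                       <a href='https://elsewhere.example/x'>External</a>
                       <a name='anchor-without-href'>No href</a>
                     </body></html>";

        CQ cq = html; // implicit conversion parses the markup, exactly as in DoScrape

        // jQuery-style selector: every <a>, projected to its href, skipping anchors without one
        var links = cq["a"].Select(x => x.GetAttribute("href")).Where(x => x != null);

        foreach (var link in links)
        {
            Console.WriteLine(link); // /about, then https://elsewhere.example/x
        }
    }
}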
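The page-base ternary is the densest line in the method, so here it is extracted into a tiny repro: if the last URI segment looks like a file name, strip it; either way, end with a slash so relative links resolve from the right directory. Runnable as C# top-level statements (the sample URLs are made up, and the closing aside about NormalizeLink is an assumption, since its body isn't shown here):

using System;
using System.Linq;

// Same logic as DoScrape's pageBase computation, pulled out for clarity
static Uri PageBase(Uri pageUri)
{
    // A '.' in the last segment is taken to mean "this is a file, not a directory"
    var pageBase = pageUri.Segments.Last().Contains('.')
        ? pageUri.ToString().Substring(0, pageUri.ToString().LastIndexOf('/'))
        : pageUri.ToString();

    if (!pageBase.EndsWith("/"))
    {
        pageBase += "/";
    }

    return new Uri(pageBase);
}

Console.WriteLine(PageBase(new Uri("https://example.com/docs/index.html"))); // https://example.com/docs/
Console.WriteLine(PageBase(new Uri("https://example.com/docs")));            // https://example.com/docs/

// Relative hrefs can then be resolved against that base, e.g.
// new Uri(PageBase(...), "page2.html") -> https://example.com/docs/page2.html
// (presumably what NormalizeLink does; its body isn't shown here)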