Exemplo n.º 1
0
        /// <summary>
        /// Entry point for a crawl: records the base URI, optionally loads and honors
        /// the site's robots.txt, then starts the recursive scrape from <paramref name="uri"/>.
        /// </summary>
        /// <param name="uri">Root URI the crawl starts from; also becomes the base for locality checks.</param>
        public async Task Scrape(Uri uri)
        {
            baseUri = uri;

            if (!DisableRobotsProtocol)
            {
                // robots.txt always lives at the authority root, regardless of the start path.
                var robotsUri = new Uri(uri.GetLeftPart(UriPartial.Authority) + "/robots.txt");
                var robotsTxt = await httpClient.GetString(robotsUri);

                Robots.Load(robotsTxt, httpClient.UserAgentName);
            }

            // Await instead of the original DoScrape(uri).Wait(): blocking on a task inside
            // an async method risks deadlock on a synchronization context and surfaces
            // failures as AggregateException instead of the real exception.
            await DoScrape(uri);
        }
Exemplo n.º 2
0
        // This is where the rubber hits the road: fetch one page, notify observers,
        // extract its links, and recurse into each local link in parallel.
        async Task DoScrape(Uri uri)
        {
            OnScrape(uri);

            // Stop conditions: past the configured end time, already visited,
            // or disallowed by robots.txt.
            if (endDateTime.HasValue && DateTimeProvider.UtcNow > endDateTime)
            {
                return;
            }
            if (!scrapedUris.TryAdd(uri))
            {
                return;
            }
            if (!DisableRobotsProtocol && !Robots.PathIsAllowed(uri.PathAndQuery))
            {
                return;
            }

            var htmlDoc = new HtmlDoc {
                Uri = uri
            };

            try
            {
                htmlDoc.Html = await httpClient.GetString(uri);
            }
            catch (Exception exception)
            {
                // Fetch failures are reported to the owner rather than aborting the crawl;
                // htmlDoc.Html stays empty so the early return below skips this page.
                OnHttpClientException(exception);
            }
            if (string.IsNullOrEmpty(htmlDoc.Html))
            {
                return;
            }

            // Simplified from !(filter != null && !filter.IsMatch(...)):
            // notify when there is no filter, or when the filter matches this URI.
            if (ObserverLinkFilter == null || ObserverLinkFilter.IsMatch(uri.ToString()))
            {
                NotifyObservers(htmlDoc);
            }

            // Base for resolving relative links: if the last segment looks like a file
            // (contains '.'), strip it back to the last '/'; always end with '/'.
            var uriText = htmlDoc.Uri.ToString();
            var pageBase = htmlDoc.Uri.Segments.Last().Contains('.')
                ? uriText.Substring(0, uriText.LastIndexOf('/'))
                : uriText;

            if (!pageBase.EndsWith("/", StringComparison.Ordinal))
            {
                pageBase += "/";
            }
            var pageBaseUri = new Uri(pageBase);

            //only use of the CsQuery lib found so far
            CQ cq = htmlDoc.Html;

            //Doing some selecting: anchors and hrefs using JQuery-like syntax
            //out of the box, DoScrape does only the simplest finding of links
            var links = cq["a"].Select(x => x.GetAttribute("href")).Where(x => x != null);

            // Stay inside the start site (no external links) and cap URL length.
            // Ordinal comparison: URLs are machine identifiers, not linguistic text.
            var localLinks = LocalLinks(links)
                .Select(x => NormalizeLink(x, pageBaseUri))
                .Where(x => x.ToString().StartsWith(baseUri.ToString(), StringComparison.Ordinal) && x.ToString().Length <= 2048);

            if (IncludeLinks != null)
            {
                localLinks = localLinks.Where(x => IncludeLinks.IsMatch(x.ToString()));
            }
            if (IgnoreLinks != null)
            {
                localLinks = localLinks.Where(x => !IgnoreLinks.IsMatch(x.ToString()));
            }
            if (MaxDepth.HasValue)
            {
                // +1 because Uri.Segments always includes the leading "/" root segment.
                localLinks = localLinks.Where(x => x.Segments.Length <= MaxDepth + 1);
            }

            var tasks = localLinks.Select(DoScrape).ToArray(); //recursive call to scrape the links found

            // Await Task.WhenAll instead of the original Task.WaitAll: blocking inside
            // an async method ties up a thread and risks deadlock/thread-pool starvation.
            await Task.WhenAll(tasks);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Scrapes a single page: fetches its HTML, notifies observers when the
        /// observer filter allows it, then recursively scrapes all discovered
        /// local links in parallel.
        /// </summary>
        /// <param name="uri">The page to fetch and expand.</param>
        async Task DoScrape(Uri uri)
        {
            OnScrape(uri);

            // Stop conditions: past the configured end time, already visited,
            // or disallowed by robots.txt.
            if (endDateTime.HasValue && DateTimeProvider.UtcNow > endDateTime)
            {
                return;
            }
            if (!scrapedUris.TryAdd(uri))
            {
                return;
            }
            if (!DisableRobotsProtocol && !Robots.PathIsAllowed(uri.PathAndQuery))
            {
                return;
            }

            var htmlDoc = new HtmlDoc {
                Uri = uri
            };

            try
            {
                htmlDoc.Html = await httpClient.GetString(uri);
            }
            catch (Exception exception)
            {
                // Report fetch failures instead of aborting the crawl; htmlDoc.Html
                // stays empty so the early return below skips this page.
                OnHttpClientException(exception);
            }
            if (string.IsNullOrEmpty(htmlDoc.Html))
            {
                return;
            }

            // Simplified from !(filter != null && !filter.IsMatch(...)):
            // notify when there is no filter, or when the filter matches this URI.
            if (ObserverLinkFilter == null || ObserverLinkFilter.IsMatch(uri.ToString()))
            {
                NotifyObservers(htmlDoc);
            }

            // Base for resolving relative links: if the last segment looks like a file
            // (contains '.'), strip it back to the last '/'; always end with '/'.
            var uriText = htmlDoc.Uri.ToString();
            var pageBase = htmlDoc.Uri.Segments.Last().Contains('.')
                ? uriText.Substring(0, uriText.LastIndexOf('/'))
                : uriText;

            if (!pageBase.EndsWith("/", StringComparison.Ordinal))
            {
                pageBase += "/";
            }
            var pageBaseUri = new Uri(pageBase);

            // CsQuery: jQuery-like selection of anchor hrefs.
            CQ  cq          = htmlDoc.Html;
            var links       = cq["a"].Select(x => x.GetAttribute("href")).Where(x => x != null);

            // Stay inside the start site and cap URL length; Ordinal comparison
            // because URLs are machine identifiers, not linguistic text.
            var localLinks  = LocalLinks(links)
                .Select(x => NormalizeLink(x, pageBaseUri))
                .Where(x => x.ToString().StartsWith(baseUri.ToString(), StringComparison.Ordinal) && x.ToString().Length <= 2048);

            if (IncludeLinks != null)
            {
                localLinks = localLinks.Where(x => IncludeLinks.IsMatch(x.ToString()));
            }
            if (IgnoreLinks != null)
            {
                localLinks = localLinks.Where(x => !IgnoreLinks.IsMatch(x.ToString()));
            }
            if (MaxDepth.HasValue)
            {
                // +1 because Uri.Segments always includes the leading "/" root segment.
                localLinks = localLinks.Where(x => x.Segments.Length <= MaxDepth + 1);
            }

            var tasks = localLinks.Select(DoScrape).ToArray();

            // Await Task.WhenAll instead of the original Task.WaitAll: blocking inside
            // an async method ties up a thread and risks deadlock/thread-pool starvation.
            await Task.WhenAll(tasks);
        }