Example #1
0
        /// <summary>
        /// Buffers the body of <paramref name="response"/>, parses it with the stream-based
        /// <c>Parse</c> overload and builds the resulting <see cref="CrawledContent"/>.
        /// </summary>
        /// <param name="uri">Address of the crawled resource; forwarded to the stream-based overload.</param>
        /// <param name="response">HTTP response whose content headers, body and <c>X-Robots-Tag</c> header are read.</param>
        /// <param name="settings">Crawl settings; not used by this method.</param>
        /// <returns>
        /// The crawled content. <c>RawContent</c> is cleared when the page is marked <c>noindex</c>,
        /// and <c>Links</c> is only populated when the page is not marked <c>nofollow</c>.
        /// </returns>
        public async Task <CrawledContent> Parse(Uri uri, HttpResponseMessage response, CrawlSettings settings)
        {
            // A response is not required to carry a Content-Type header, in which case this is null.
            var contentType = response.Content.Headers.ContentType;

            var crawledContent = new CrawledContent
            {
                ContentType     = contentType?.MediaType,
                CharacterSet    = contentType?.CharSet,
                ContentEncoding = string.Join(",", response.Content.Headers.ContentEncoding)
            };

            // Owning the buffer with `using` from the start guarantees disposal even if the copy throws.
            using (var contentStream = new MemoryStream())
            {
                await(await response.Content.ReadAsStreamAsync()).CopyToAsync(contentStream);

                // CopyToAsync leaves the buffer positioned at its end; rewind before reading,
                // otherwise ReadToEnd() yields an empty string.
                contentStream.Seek(0, SeekOrigin.Begin);

                // The reader is intentionally not disposed: disposing it would also close
                // contentStream, which is still needed by the parse below.
                crawledContent.RawContent = new StreamReader(contentStream).ReadToEnd();

                // Rewind again so the parser sees the document from the first byte.
                contentStream.Seek(0, SeekOrigin.Begin);

                var parsedContent = Parse(uri, contentStream);

                crawledContent.CanonicalUri = parsedContent.CanonicalUri;

                if (response.Headers.Contains("X-Robots-Tag"))
                {
                    var robotsHeaderValues = response.Headers.GetValues("X-Robots-Tag").ToArray();

                    // Header directives are combined with whatever the stream-based parse already
                    // set, so a header carrying only one directive cannot clear the other flag.
                    parsedContent.NoIndex = parsedContent.NoIndex || robotsHeaderValues.Any(r =>
                        r.IndexOf("noindex", StringComparison.InvariantCultureIgnoreCase) != -1);
                    parsedContent.NoFollow = parsedContent.NoFollow || robotsHeaderValues.Any(r =>
                        r.IndexOf("nofollow", StringComparison.InvariantCultureIgnoreCase) != -1);
                }

                if (parsedContent.NoIndex)
                {
                    // Content that must not be indexed is not retained.
                    crawledContent.RawContent = null;
                }

                if (!parsedContent.NoFollow)
                {
                    crawledContent.Links = parsedContent.Links;
                }

                return crawledContent;
            }
        }
        /// <summary>
        /// Loads an HTML document from <paramref name="contentStream"/> and assembles the crawl
        /// result for it: content header details, page-level robots rules, canonical URI and links.
        /// </summary>
        /// <param name="requestUri">Address the content was requested from; passed to the canonical-URI and link helpers.</param>
        /// <param name="headers">Content and response headers captured for the request.</param>
        /// <param name="contentStream">Stream the HTML document is loaded from.</param>
        /// <returns>The populated <see cref="CrawledContent"/>.</returns>
        public CrawledContent Parse(Uri requestUri, CrawlHeaders headers, Stream contentStream)
        {
            var result = new CrawledContent();

            // Content-Type may be absent, hence the null-conditional access.
            result.ContentType  = headers.ContentHeaders.ContentType?.MediaType;
            result.CharacterSet = headers.ContentHeaders.ContentType?.CharSet;

            var encodings = headers.ContentHeaders.ContentEncoding;
            result.ContentEncoding = encodings == null ? null : string.Join(",", encodings);

            var html = new HtmlDocument();
            html.Load(contentStream);

            // Robots rules: every X-Robots-Tag header value first, then the robots meta tag (if any).
            var robotRules = new List<string>();

            if (headers.ResponseHeaders.Contains("X-Robots-Tag"))
            {
                robotRules.AddRange(headers.ResponseHeaders.GetValues("X-Robots-Tag"));
            }

            var headMetaTags = html.DocumentNode.SelectNodes("html/head/meta");

            // First "content" attribute value found on a <meta name="robots"> element;
            // null when there are no meta elements or none of them qualifies.
            var metaDirective = headMetaTags == null
                ? null
                : (from meta in headMetaTags
                   where meta.Attributes.Any(a => a.Name == "name" && a.Value.Equals("robots", StringComparison.InvariantCultureIgnoreCase))
                   from attribute in meta.Attributes
                   where attribute.Name == "content"
                   select attribute.Value).FirstOrDefault();

            if (metaDirective != null)
            {
                robotRules.Add(metaDirective);
            }

            result.PageRobotRules = robotRules.ToArray();
            result.CanonicalUri   = GetCanonicalUri(html, requestUri);
            result.Links          = GetLinks(html, requestUri).ToArray();

            return result;
        }
Example #3
0
        /// <summary>
        /// Records the outcome of a crawl that returned content for <paramref name="requestUri"/>.
        /// Nothing happens when no crawl state is tracked for the URI.
        /// </summary>
        /// <param name="requestUri">URI whose crawl state is looked up in <c>UriCrawlStates</c>.</param>
        /// <param name="content">Content produced for the URI; its page robots rules decide the outcome.</param>
        public void AddResult(Uri requestUri, CrawledContent content)
        {
            if (!UriCrawlStates.TryGetValue(requestUri, out var state))
            {
                return;
            }

            var pageRules = RobotsPageParser.FromRules(content.PageRobotRules);

            if (!pageRules.CanIndex(Settings.UserAgent))
            {
                // The page's own robots rules forbid indexing: report it as blocked, without content.
                Logger?.LogDebug($"Result content for {requestUri} has been blocked by an in-page Robots rule.");

                var blockedResult = new CrawledUri
                {
                    Location      = state.Location,
                    Status        = CrawlStatus.RobotsBlocked,
                    Requests      = state.Requests,
                    RedirectChain = state.Redirects
                };
                AddResult(blockedResult);
                return;
            }

            Logger?.LogDebug($"Result for {requestUri} has completed successfully with content.");

            var crawledResult = new CrawledUri
            {
                Location      = state.Location,
                Status        = CrawlStatus.Crawled,
                RedirectChain = state.Redirects,
                Requests      = state.Requests,
                Content       = content
            };
            AddResult(crawledResult);

            if (!pageRules.CanFollowLinks(Settings.UserAgent))
            {
                return;
            }

            // Links are only queued when the page rules allow following them.
            foreach (var link in content.Links)
            {
                AddLink(link);
            }
        }
Example #4
0
        /// <summary>
        /// Records the outcome of a crawl for <paramref name="requestUri"/>, with or without content.
        /// Nothing happens when no crawl state is tracked for the URI.
        /// </summary>
        /// <param name="requestUri">URI whose crawl state is looked up in <c>UriCrawlStates</c>.</param>
        /// <param name="content">
        /// Content produced for the URI, or <c>null</c>; when present, its page robots rules are
        /// evaluated before the result is recorded.
        /// </param>
        public void AddResult(Uri requestUri, CrawledContent content)
        {
            if (!UriCrawlStates.TryGetValue(requestUri, out var state))
            {
                return;
            }

            if (content != null)
            {
                var pageRules = RobotsPageParser.FromRules(content.PageRobotRules);

                if (!pageRules.CanIndex(Settings.UserAgent))
                {
                    // Indexing is forbidden by the page rules: report as blocked, without content.
                    var blockedResult = new CrawledUri
                    {
                        Location      = state.Location,
                        Status        = CrawlStatus.RobotsBlocked,
                        Requests      = state.Requests,
                        RedirectChain = state.Redirects
                    };
                    AddResult(blockedResult);
                    return;
                }

                if (pageRules.CanFollowLinks(Settings.UserAgent))
                {
                    // Links are queued before the crawled result itself is recorded.
                    foreach (var link in content.Links)
                    {
                        AddLink(link);
                    }
                }
            }

            var crawledResult = new CrawledUri
            {
                Location      = state.Location,
                Status        = CrawlStatus.Crawled,
                RedirectChain = state.Redirects,
                Requests      = state.Requests,
                Content       = content
            };
            AddResult(crawledResult);
        }