/// <summary>
/// Buffers the body of <paramref name="response"/>, captures it as raw text, parses it
/// through the stream-based <c>Parse(uri, stream)</c> overload (defined outside this block)
/// and applies any <c>X-Robots-Tag</c> response header to the result.
/// </summary>
/// <param name="uri">The URI the response was fetched from; forwarded to the stream parser.</param>
/// <param name="response">The HTTP response whose headers and body are inspected.</param>
/// <param name="settings">Crawl settings. Not read by this method; kept for interface compatibility.</param>
/// <returns>
/// The crawled content. <c>RawContent</c> is cleared when the page is marked "noindex" and
/// <c>Links</c> is only populated when the page is not marked "nofollow".
/// </returns>
public async Task<CrawledContent> Parse(Uri uri, HttpResponseMessage response, CrawlSettings settings)
{
    // Content-Type is an optional header, so the typed header object may be null.
    var contentType = response.Content.Headers.ContentType;
    var crawledContent = new CrawledContent
    {
        ContentType = contentType?.MediaType,
        CharacterSet = contentType?.CharSet,
        ContentEncoding = string.Join(",", response.Content.Headers.ContentEncoding)
    };

    using (var contentStream = new MemoryStream())
    {
        await (await response.Content.ReadAsStreamAsync()).CopyToAsync(contentStream);

        // CopyToAsync leaves the position at the end of the buffer; rewind first,
        // otherwise ReadToEnd would always yield an empty string.
        contentStream.Seek(0, SeekOrigin.Begin);

        // The reader is intentionally not disposed: disposing it would also close
        // contentStream, which is still needed by the parser below.
        crawledContent.RawContent = new StreamReader(contentStream).ReadToEnd();

        // Rewind again so the parser sees the document from its first byte.
        contentStream.Seek(0, SeekOrigin.Begin);

        var parsedContent = Parse(uri, contentStream);
        crawledContent.CanonicalUri = parsedContent.CanonicalUri;

        if (response.Headers.TryGetValues("X-Robots-Tag", out var headerValues))
        {
            // Materialize once; the values are scanned for two different directives.
            var robotsHeaderValues = headerValues.ToArray();

            // Combine with (rather than overwrite) whatever the parser already decided,
            // so a header that lacks a directive cannot lift a restriction set by the page.
            // NOTE(review): assumes the stream parser populates NoIndex/NoFollow - confirm.
            parsedContent.NoIndex |= robotsHeaderValues.Any(
                r => r.IndexOf("noindex", StringComparison.OrdinalIgnoreCase) != -1);
            parsedContent.NoFollow |= robotsHeaderValues.Any(
                r => r.IndexOf("nofollow", StringComparison.OrdinalIgnoreCase) != -1);
        }

        if (parsedContent.NoIndex)
        {
            crawledContent.RawContent = null;
        }

        if (!parsedContent.NoFollow)
        {
            crawledContent.Links = parsedContent.Links;
        }

        return crawledContent;
    }
}
/// <summary>
/// Loads an HTML document from <paramref name="contentStream"/> and produces a
/// <see cref="CrawledContent"/> describing it.
/// </summary>
/// <param name="requestUri">The requested URI; passed to the canonical-URI and link helpers.</param>
/// <param name="headers">Content and response headers that accompanied the document.</param>
/// <param name="contentStream">Stream positioned at the HTML to load.</param>
/// <returns>
/// Content metadata from the headers, the collected robots rules (every <c>X-Robots-Tag</c>
/// header value followed by the first robots meta tag's content, if any), the canonical URI
/// and the links returned by the helpers.
/// </returns>
public CrawledContent Parse(Uri requestUri, CrawlHeaders headers, Stream contentStream)
{
    var result = new CrawledContent
    {
        ContentType = headers.ContentHeaders.ContentType?.MediaType,
        CharacterSet = headers.ContentHeaders.ContentType?.CharSet,
        ContentEncoding = headers.ContentHeaders.ContentEncoding != null
            ? string.Join(",", headers.ContentHeaders.ContentEncoding)
            : null
    };

    var html = new HtmlDocument();
    html.Load(contentStream);

    // Header-supplied rules come first, followed by the in-page meta rule.
    var robotRules = new List<string>();
    if (headers.ResponseHeaders.Contains("X-Robots-Tag"))
    {
        robotRules.AddRange(headers.ResponseHeaders.GetValues("X-Robots-Tag"));
    }

    var headMetaNodes = html.DocumentNode.SelectNodes("html/head/meta");
    if (metaNodesPresent(headMetaNodes))
    {
        // Meta tags whose name attribute is "robots" (case-insensitive).
        var robotsMetaNodes = headMetaNodes.Where(
            node => node.Attributes.Any(
                attribute => attribute.Name == "name"
                    && attribute.Value.Equals("robots", StringComparison.InvariantCultureIgnoreCase)));

        // Only the first content value found across those tags is used.
        var contentValues = robotsMetaNodes.SelectMany(
            node => node.Attributes
                .Where(attribute => attribute.Name == "content")
                .Select(attribute => attribute.Value));

        var firstRobotsContent = contentValues.FirstOrDefault();
        if (firstRobotsContent != null)
        {
            robotRules.Add(firstRobotsContent);
        }
    }

    result.PageRobotRules = robotRules.ToArray();
    result.CanonicalUri = GetCanonicalUri(html, requestUri);
    result.Links = GetLinks(html, requestUri).ToArray();
    return result;

    // SelectNodes yields null rather than an empty collection when nothing matches.
    bool metaNodesPresent(object nodes)
    {
        return nodes != null;
    }
}
/// <summary>
/// Records the outcome of a crawl that returned content for <paramref name="requestUri"/>.
/// Does nothing when the URI has no entry in <c>UriCrawlStates</c>.
/// </summary>
/// <param name="requestUri">The URI that was requested.</param>
/// <param name="content">The parsed content, including its in-page robots rules and links.</param>
public void AddResult(Uri requestUri, CrawledContent content)
{
    if (!UriCrawlStates.TryGetValue(requestUri, out var state))
    {
        return;
    }

    var pageRobots = RobotsPageParser.FromRules(content.PageRobotRules);

    // Indexing disallowed for our user agent: record the block and drop the content.
    if (!pageRobots.CanIndex(Settings.UserAgent))
    {
        Logger?.LogDebug($"Result content for {requestUri} has been blocked by an in-page Robots rule.");
        AddResult(new CrawledUri
        {
            Location = state.Location,
            Status = CrawlStatus.RobotsBlocked,
            Requests = state.Requests,
            RedirectChain = state.Redirects
        });
        return;
    }

    Logger?.LogDebug($"Result for {requestUri} has completed successfully with content.");
    AddResult(new CrawledUri
    {
        Location = state.Location,
        Status = CrawlStatus.Crawled,
        RedirectChain = state.Redirects,
        Requests = state.Requests,
        Content = content
    });

    // Links are only queued when the page allows following them.
    if (!pageRobots.CanFollowLinks(Settings.UserAgent))
    {
        return;
    }

    foreach (var link in content.Links)
    {
        AddLink(link);
    }
}
/// <summary>
/// Records the outcome of a crawl for <paramref name="requestUri"/>, with or without content.
/// Does nothing when the URI has no entry in <c>UriCrawlStates</c>.
/// </summary>
/// <param name="requestUri">The URI that was requested.</param>
/// <param name="content">
/// The parsed content, or <c>null</c>; when null the robots checks and link queuing are skipped
/// and the result is recorded as crawled with no content.
/// </param>
public void AddResult(Uri requestUri, CrawledContent content)
{
    if (!UriCrawlStates.TryGetValue(requestUri, out var state))
    {
        return;
    }

    if (content != null)
    {
        var pageRobots = RobotsPageParser.FromRules(content.PageRobotRules);

        var indexingAllowed = pageRobots.CanIndex(Settings.UserAgent);
        if (!indexingAllowed)
        {
            // Blocked by an in-page rule: record the block without content and stop.
            AddResult(new CrawledUri
            {
                Location = state.Location,
                Status = CrawlStatus.RobotsBlocked,
                Requests = state.Requests,
                RedirectChain = state.Redirects
            });
            return;
        }

        // Links are queued before the crawled result itself is recorded.
        var followingAllowed = pageRobots.CanFollowLinks(Settings.UserAgent);
        if (followingAllowed)
        {
            foreach (var link in content.Links)
            {
                AddLink(link);
            }
        }
    }

    AddResult(new CrawledUri
    {
        Location = state.Location,
        Status = CrawlStatus.Crawled,
        RedirectChain = state.Redirects,
        Requests = state.Requests,
        Content = content
    });
}