/// <summary>
/// Parses the body of a robots.txt file into a <see cref="RobotsTextResponse"/>,
/// capturing the wildcard ("*") record and the record matching our own user agent.
/// </summary>
/// <param name="text">Raw robots.txt content.</param>
/// <returns>The parsed response; agents not present in the file remain null.</returns>
public static RobotsTextResponse ParseText(string text)
{
    const string UserAgentPrefix = "User-agent:";
    const string DisallowPrefix = "Disallow:";
    const string AllowPrefix = "Allow:";

    var response = new RobotsTextResponse();
    var startedParsing = false;
    UserAgent currentAgent = null;
    int index = 0;
    while (index < text.Length)
    {
        var line = GetLine(text, ref index);
        if (line.Length == 0)
        {
            // A blank line terminates the current record.
            startedParsing = false;
            continue;
        }
        // Directive names are case-insensitive per RFC 9309; slicing by the
        // prefix length + Trim() also accepts "User-agent:*" with no space
        // (the original hard-coded offsets 12/10/7 required exactly one space).
        if (line.StartsWith(UserAgentPrefix, StringComparison.OrdinalIgnoreCase))
        {
            var agentName = line.Substring(UserAgentPrefix.Length).Trim();
            if (agentName == "*")
            {
                response.AnyAgent = new UserAgent();
                currentAgent = response.AnyAgent;
                currentAgent.Name = "*";
                startedParsing = true;
            }
            else if (IsMatchingUserAgent(line))
            {
                response.MyAgent = new UserAgent();
                currentAgent = response.MyAgent;
                currentAgent.Name = agentName;
                startedParsing = true;
            }
            // Records for other crawlers are skipped until the next blank line.
            continue;
        }
        if (!startedParsing)
        {
            continue;
        }
        if (line.StartsWith(DisallowPrefix, StringComparison.OrdinalIgnoreCase))
        {
            var path = line.Substring(DisallowPrefix.Length).Trim();
            if (path.Length > 0)
            {
                currentAgent.DisallowedUrls.Add(path);
            }
        }
        else if (line.StartsWith(AllowPrefix, StringComparison.OrdinalIgnoreCase))
        {
            var path = line.Substring(AllowPrefix.Length).Trim();
            if (path.Length > 0)
            {
                currentAgent.AllowedUrls.Add(path);
            }
        }
    }
    return response;
}
/// <summary>
/// Decides whether <paramref name="baseUrl"/> may be crawled under the given
/// robots rules. A missing robots.txt (<c>null</c>) means everything is allowed.
/// </summary>
/// <param name="robots">Parsed robots.txt, or null when none was retrieved.</param>
/// <param name="baseUrl">The URL being considered for crawling.</param>
/// <returns>True when crawling is permitted.</returns>
public static bool IsAllowed(RobotsTextResponse robots, Uri baseUrl)
{
    if (robots == null)
    {
        return true;
    }
    // A record that names our crawler specifically takes precedence over
    // the wildcard ("*") record; with neither present, default to allowed.
    var agent = robots.MyAgent ?? robots.AnyAgent;
    return agent == null || IsAllowedUserAgent(agent, baseUrl);
}
/// <summary>
/// Downloads and parses robots.txt for the given URI, following redirects up to
/// <c>RunSettings.MaxRetries</c> times. Results (including failures, as null)
/// are cached under the original host's authority.
/// </summary>
/// <param name="uri">Any URI on the target host; only scheme and authority are used.</param>
/// <param name="retries">Current redirect depth (internal; callers use the default).</param>
/// <param name="cacheKey">Cache key carried across redirects so the result is stored
/// under the host we were originally asked about.</param>
/// <returns>The parsed robots.txt, or null when unavailable.</returns>
public static RobotsTextResponse GetRobotsResponse(Uri uri, int retries = 0, string cacheKey = null)
{
    if (cacheKey == null)
    {
        // In case of redirects, keep the same key.
        cacheKey = uri.Authority;
    }
    if (retries == RunSettings.MaxRetries)
    {
        // Give up; cache the failure so we don't re-fetch this host every time.
        ApplicationCache.Robots.Add(cacheKey, null);
        return null;
    }
    retries++;
    var robots = new Uri($"{uri.Scheme}://{uri.Authority}/robots.txt");
    var page = Download(robots);
    if (page == null)
    {
        ApplicationCache.Robots.Add(cacheKey, null);
        return null;
    }
    if (page.StatusCode != HttpStatusCode.OK && page.StatusCode != HttpStatusCode.NotModified)
    {
        // BUG FIX: the original condition was
        //   Moved || Redirect && TemporaryRedirect
        // where && binds tighter than || and a status can never equal two values,
        // so only 301 was ever followed. Follow 301, 302 and 307 alike.
        var isRedirect = page.StatusCode == HttpStatusCode.Moved
                         || page.StatusCode == HttpStatusCode.Redirect
                         || page.StatusCode == HttpStatusCode.TemporaryRedirect;
        if (isRedirect && page.Headers.ContainsKey("Location"))
        {
            return GetRobotsResponse(new Uri(page.Headers["Location"]), retries, cacheKey);
        }
        ApplicationCache.Robots.Add(cacheKey, null);
        return null;
    }
    var parsedRobots = RobotsTextResponse.ParseText(page.Html);
    ApplicationCache.Robots.Add(cacheKey, parsedRobots);
    return parsedRobots;
}
/// <summary>
/// Crawls a single URL: checks robots rules, downloads the page, honors the
/// robots meta tag (noindex / nofollow), persists the page for the indexer,
/// and enqueues outgoing links on the URL frontier.
/// </summary>
/// <param name="siteUrl">Absolute URL to crawl.</param>
/// <param name="retries">Current redirect depth (internal; callers use the default).</param>
private void ProcessUrl(string siteUrl, int retries = 0)
{
    if (retries == RunSettings.MaxRetries)
    {
        return;
    }
    retries++;
    var baseUrl = new Uri(siteUrl);

    // Robots rules: use the cached entry when present, otherwise fetch.
    RobotsTextResponse robots = null;
    if (!ApplicationCache.Robots.TryGetValue(baseUrl.Authority, out robots))
    {
        robots = TcpCrawlingClient.GetRobotsResponse(baseUrl);
    }
    if (ApplicationCache.Robots.Count > RunSettings.MaxRobotsSize)
    {
        ApplicationCache.TrimRobots();
    }
    if (!RobotsTextResponse.IsAllowed(robots, baseUrl))
    {
        return;
    }

    var page = TcpCrawlingClient.Download(baseUrl);
    if (ApplicationCache.VisitedUrls.Count > RunSettings.MaxVisitedUrls)
    {
        ApplicationCache.TrimVisitedUrls();
    }
    ApplicationCache.VisitedUrls.Add(siteUrl);
    if (page == null)
    {
        return;
    }

    // BUG FIX: 302 (HttpStatusCode.Redirect) was previously not handled here,
    // even though GetRobotsResponse follows it; treat 301/302/307 uniformly.
    var isRedirect = page.StatusCode == HttpStatusCode.MovedPermanently
                     || page.StatusCode == HttpStatusCode.Redirect
                     || page.StatusCode == HttpStatusCode.TemporaryRedirect;
    if (isRedirect && page.Headers.ContainsKey("Location"))
    {
        ProcessRedirectedLocation(page, baseUrl, retries);
        return;
    }
    if (page.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(page.Html);

    // Robots meta: noindex => don't persist the page; nofollow => index it but
    // don't take its anchors.
    var metaTags = doc.DocumentNode.Descendants("meta").ToList();
    bool containsNoIndex = false, containsNoFollow = false;
    var robotsMeta = metaTags.FirstOrDefault(x => x.Attributes["name"]?.Value == "robots")?.Attributes["content"]?.Value;
    if (robotsMeta != null)
    {
        // BUG FIX: directive matching must be case-insensitive ("NOFOLLOW" is
        // valid HTML); the original case-sensitive Contains missed it.
        containsNoFollow = robotsMeta.IndexOf("nofollow", StringComparison.OrdinalIgnoreCase) >= 0;
        containsNoIndex = robotsMeta.IndexOf("noindex", StringComparison.OrdinalIgnoreCase) >= 0;
    }

    var anchors = doc.DocumentNode.Descendants("a");
    if (!containsNoIndex)
    {
        // Save it on disk so the indexer can do its job.
        FileTreeWorker.CreateWebsiteTree(baseUrl, page.Html);
    }
    if (containsNoFollow)
    {
        // Do not extract the links.
        return;
    }
    foreach (var a in anchors)
    {
        var currentAnchor = a.Attributes["href"]?.Value;
        if (string.IsNullOrEmpty(currentAnchor))
        {
            continue;
        }
        // Skip rel="nofollow" links (== already handles a null rel safely).
        if (a.Attributes["rel"]?.Value == "nofollow")
        {
            continue;
        }
        UrlFrontier.Enqueue(baseUrl, currentAnchor);
    }
}